diff options
Diffstat (limited to 'xlators/cluster/afr')
32 files changed, 25579 insertions, 26461 deletions
diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am index 35d18a6c0da..610819b28fc 100644 --- a/xlators/cluster/afr/src/Makefile.am +++ b/xlators/cluster/afr/src/Makefile.am @@ -1,29 +1,28 @@ -xlator_LTLIBRARIES = afr.la pump.la +xlator_LTLIBRARIES = afr.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c \ - afr-inode-write.c afr-open.c afr-transaction.c afr-self-heal-data.c \ - afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c \ - afr-self-heal-algorithm.c afr-lk-common.c afr-self-heald.c \ + afr-inode-write.c afr-open.c afr-transaction.c afr-lk-common.c \ + afr-read-txn.c \ $(top_builddir)/xlators/lib/src/libxlator.c -afr_la_LDFLAGS = -module -avoid-version -afr_la_SOURCES = $(afr_common_source) afr.c -afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la +AFR_SELFHEAL_SOURCES = afr-self-heal-common.c afr-self-heal-data.c \ + afr-self-heal-entry.c afr-self-heal-metadata.c afr-self-heald.c \ + afr-self-heal-name.c -pump_la_LDFLAGS = -module -avoid-version -pump_la_SOURCES = $(afr_common_source) pump.c -pump_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la +afr_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) +afr_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) afr.c +afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h \ - afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h \ - afr-self-heal-algorithm.h pump.h afr-mem-types.h afr-common.c \ - afr-self-heald.h $(top_builddir)/xlators/lib/src/libxlator.h \ - $(top_builddir)/glusterfsd/src/glusterfsd.h + afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-mem-types.h \ + afr-common.c afr-self-heald.h \ + $(top_builddir)/xlators/lib/src/libxlator.h afr-messages.h AM_CPPFLAGS = $(GF_CPPFLAGS) \ -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \ - -I$(top_srcdir)/rpc/rpc-lib/src + -I$(top_srcdir)/rpc/rpc-lib/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src AM_CFLAGS = -Wall $(GF_CFLAGS) @@ -31,7 +30,6 @@ CLEANFILES = uninstall-local: rm -f $(DESTDIR)$(xlatordir)/replicate.so - rm -f $(DESTDIR)$(xlatordir)/pump.so install-data-hook: ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 8b6ab9a7f69..032ab5c8001 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -15,29 +15,20 @@ #include <stdlib.h> #include <signal.h> -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" +#include <glusterfs/glusterfs.h> #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" -#include "statedump.h" -#include "inode.h" - -#include "fd.h" +#include <glusterfs/dict.h> +#include <glusterfs/hashfn.h> +#include <glusterfs/list.h> +#include <glusterfs/call-stub.h> +#include <glusterfs/defaults.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/compat.h> +#include <glusterfs/byte-order.h> +#include <glusterfs/statedump.h> +#include <glusterfs/events.h> +#include <glusterfs/upcall-utils.h> #include "afr-inode-read.h" #include "afr-inode-write.h" @@ -45,4399 +36,7843 @@ #include "afr-dir-write.h" #include "afr-transaction.h" #include "afr-self-heal.h" -#include "afr-self-heal-common.h" #include "afr-self-heald.h" -#include "pump.h" +#include "afr-messages.h" -#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000100000000ULL -#define AFR_ICTX_READ_CHILD_MASK 0x00000000FFFFFFFFULL +int32_t +afr_quorum_errno(afr_private_t *priv) +{ + return ENOTCONN; +} -int -afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, - gf_boolean_t fail_conflict); -void -afr_children_copy (int32_t *dst, int32_t *src, unsigned int child_count) +gf_boolean_t +afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name, + pid_t pid) { - int i = 0; + if (!__is_root_gfid(pargfid)) { + return _gf_false; + } - for (i = 0; i < child_count; i++) - dst[i] = src[i]; + if (strcmp(name, GF_REPLICATE_TRASH_DIR) == 0) { + /*For backward compatibility /.landfill is private*/ + return _gf_true; + } + + if (pid == GF_CLIENT_PID_GSYNCD) { + /*geo-rep needs to create/sync private directory on slave because + * it appears in changelog*/ + return _gf_false; + } + + if (pid == GF_CLIENT_PID_GLFS_HEAL || pid == GF_CLIENT_PID_SELF_HEALD) { + if (strcmp(name, priv->anon_inode_name) == 0) { + /* anonymous-inode dir is private*/ + return _gf_true; + } + } else { + if (strncmp(name, AFR_ANON_DIR_PREFIX, strlen(AFR_ANON_DIR_PREFIX)) == + 0) { + /* anonymous-inode dir prefix is private for geo-rep to work*/ + return _gf_true; + } + } + + return _gf_false; } void -afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, const char *path) +afr_fill_success_replies(afr_local_t *local, afr_private_t *priv, + unsigned char *replies) { - int i = 0; - afr_private_t *priv = NULL; - int ret = 0; - - priv = this->private; + int i = 0; - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_uint64 (xattr_req, priv->pending_key[i], - 3 * sizeof(int32_t)); - if (ret < 0) - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - path, priv->pending_key[i]); - /* 3 = data+metadata+entry */ - } - ret = dict_set_int32 (xattr_req, GF_GFIDLESS_LOOKUP, 1); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, "%s: failed to set gfidless " - "lookup", path); + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && local->replies[i].op_ret == 0) { + replies[i] = 1; + } else { + replies[i] = 0; } + } } int -afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this, - dict_t *xattr_req, loc_t *loc, void **gfid_req) -{ - int ret = -ENOMEM; +afr_fav_child_reset_sink_xattrs(void *opaque); - GF_ASSERT (gfid_req); - - *gfid_req = NULL; - local->xattr_req = dict_new (); - if (!local->xattr_req) - goto out; - if (xattr_req) - dict_copy (xattr_req, local->xattr_req); +int +afr_fav_child_reset_sink_xattrs_cbk(int ret, call_frame_t *frame, void *opaque); - afr_xattr_req_prepare (this, local->xattr_req, loc->path); - ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - loc->path, GLUSTERFS_INODELK_COUNT); - } - ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - loc->path, GLUSTERFS_ENTRYLK_COUNT); - } +static void +afr_discover_done(call_frame_t *frame, xlator_t *this); - ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - loc->path, GLUSTERFS_PARENT_ENTRYLK); - } +int +afr_dom_lock_acquire_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + int i = (long)cookie; - ret = dict_get_ptr (local->xattr_req, "gfid-req", gfid_req); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "%s: failed to get the gfid from dict", loc->path); - *gfid_req = NULL; - } else { - if (loc->parent != NULL) - dict_del (local->xattr_req, "gfid-req"); - } - ret = 0; -out: - return ret; -} + local->cont.lk.dom_lock_op_ret[i] = op_ret; + local->cont.lk.dom_lock_op_errno[i] = op_errno; + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to acquire %s on %s", + uuid_utoa(local->fd->inode->gfid), AFR_LK_HEAL_DOM, + priv->children[i]->name); + } else { + local->cont.lk.dom_locked_nodes[i] = 1; + } -void -afr_lookup_save_gfid (uuid_t dst, void* new, const loc_t *loc) -{ - inode_t *inode = NULL; + syncbarrier_wake(&local->barrier); - inode = loc->inode; - if (inode && !uuid_is_null (inode->gfid)) - uuid_copy (dst, inode->gfid); - else if (!uuid_is_null (loc->gfid)) - uuid_copy (dst, loc->gfid); - else if (new && !uuid_is_null (new)) - uuid_copy (dst, new); + return 0; } int -afr_errno_count (int32_t *children, int *child_errno, - unsigned int child_count, int32_t op_errno) -{ - int i = 0; - int errno_count = 0; - int child = 0; - - for (i = 0; i < child_count; i++) { - if (children) { - child = children[i]; - if (child == -1) - break; - } else { - child = i; - } - if (child_errno[child] == op_errno) - errno_count++; +afr_dom_lock_acquire(call_frame_t *frame) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct gf_flock flock = { + 0, + }; + int i = 0; + + priv = frame->this->private; + local = frame->local; + local->cont.lk.dom_locked_nodes = GF_CALLOC( + priv->child_count, sizeof(*local->cont.lk.locked_nodes), + gf_afr_mt_char); + if (!local->cont.lk.dom_locked_nodes) { + return -ENOMEM; + } + local->cont.lk.dom_lock_op_ret = GF_CALLOC( + priv->child_count, sizeof(*local->cont.lk.dom_lock_op_ret), + gf_afr_mt_int32_t); + if (!local->cont.lk.dom_lock_op_ret) { + return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */ + } + local->cont.lk.dom_lock_op_errno = GF_CALLOC( + priv->child_count, sizeof(*local->cont.lk.dom_lock_op_errno), + gf_afr_mt_int32_t); + if (!local->cont.lk.dom_lock_op_errno) { + return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */ + } + flock.l_type = F_WRLCK; + + AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM, + local->fd, F_SETLK, &flock, NULL); + + if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL)) + goto blocking_lock; + + /*If any of the bricks returned EAGAIN, we still need blocking locks.*/ + if (AFR_COUNT(local->cont.lk.dom_locked_nodes, priv->child_count) != + priv->child_count) { + for (i = 0; i < priv->child_count; i++) { + if (local->cont.lk.dom_lock_op_ret[i] == -1 && + local->cont.lk.dom_lock_op_errno[i] == EAGAIN) + goto blocking_lock; } - return errno_count; -} + } -int32_t -afr_set_dict_gfid (dict_t *dict, uuid_t gfid) -{ - int ret = 0; - uuid_t *pgfid = NULL; + return 0; - GF_ASSERT (gfid); +blocking_lock: + afr_dom_lock_release(frame); + AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM, + local->fd, F_SETLKW, &flock, NULL); + if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL)) { + afr_dom_lock_release(frame); + return -afr_quorum_errno(priv); + } - pgfid = GF_CALLOC (1, sizeof (uuid_t), gf_common_mt_char); - if (!pgfid) { - ret = -1; - goto out; - } + return 0; +} - uuid_copy (*pgfid, gfid); +int +afr_dom_lock_release_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + int i = (long)cookie; - ret = dict_set_dynptr (dict, "gfid-req", pgfid, sizeof (uuid_t)); - if (ret) - gf_log (THIS->name, GF_LOG_ERROR, "gfid set failed"); + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to release %s on %s", local->loc.path, + AFR_LK_HEAL_DOM, priv->children[i]->name); + } + local->cont.lk.dom_locked_nodes[i] = 0; -out: - if (ret && pgfid) - GF_FREE (pgfid); + syncbarrier_wake(&local->barrier); - return ret; + return 0; } void -afr_inode_ctx_destroy (afr_inode_ctx_t *ctx) -{ - if (!ctx) - return; - GF_FREE (ctx->fresh_children); - GF_FREE (ctx); +afr_dom_lock_release(call_frame_t *frame) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + struct gf_flock flock = { + 0, + }; + + local = frame->local; + priv = frame->this->private; + locked_on = local->cont.lk.dom_locked_nodes; + if (AFR_COUNT(locked_on, priv->child_count) == 0) + return; + flock.l_type = F_UNLCK; + + AFR_ONLIST(locked_on, frame, afr_dom_lock_release_cbk, finodelk, + AFR_LK_HEAL_DOM, local->fd, F_SETLK, &flock, NULL); + + return; } -afr_inode_ctx_t* -__afr_inode_ctx_get (inode_t *inode, xlator_t *this) +static void +afr_lk_heal_info_cleanup(afr_lk_heal_info_t *info) { - int ret = 0; - uint64_t ctx_addr = 0; - afr_inode_ctx_t *ctx = NULL; - afr_private_t *priv = NULL; + if (!info) + return; + if (info->xdata_req) + dict_unref(info->xdata_req); + if (info->fd) + fd_unref(info->fd); + GF_FREE(info->locked_nodes); + GF_FREE(info->child_up_event_gen); + GF_FREE(info->child_down_event_gen); + GF_FREE(info); +} - priv = this->private; - ret = __inode_ctx_get (inode, this, &ctx_addr); - if (ret < 0) - ctx_addr = 0; - if (ctx_addr != 0) { - ctx = (afr_inode_ctx_t*) (long) ctx_addr; - goto out; - } - ctx = GF_CALLOC (1, sizeof (*ctx), - gf_afr_mt_inode_ctx_t); - if (!ctx) - goto fail; - ctx->fresh_children = GF_CALLOC (priv->child_count, - sizeof (*ctx->fresh_children), - gf_afr_mt_int32_t); - if (!ctx->fresh_children) - goto fail; - ret = __inode_ctx_put (inode, this, (uint64_t)ctx); - if (ret) { - gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to " - "set the inode ctx (%s)", - uuid_utoa (inode->gfid)); - goto fail; +static int +afr_add_lock_to_saved_locks(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = this->private; + afr_local_t *local = frame->local; + afr_lk_heal_info_t *info = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int ret = -ENOMEM; + + info = GF_CALLOC(sizeof(*info), 1, gf_afr_mt_lk_heal_info_t); + if (!info) { + goto cleanup; + } + INIT_LIST_HEAD(&info->pos); + info->fd = fd_ref(local->fd); + info->cmd = local->cont.lk.cmd; + info->pid = frame->root->pid; + info->flock = local->cont.lk.user_flock; + info->xdata_req = dict_copy_with_ref(local->xdata_req, NULL); + if (!info->xdata_req) { + goto cleanup; + } + info->lk_owner = frame->root->lk_owner; + info->locked_nodes = GF_MALLOC( + sizeof(*info->locked_nodes) * priv->child_count, gf_afr_mt_char); + if (!info->locked_nodes) { + goto cleanup; + } + memcpy(info->locked_nodes, local->cont.lk.locked_nodes, + sizeof(*info->locked_nodes) * priv->child_count); + info->child_up_event_gen = GF_CALLOC(sizeof(*info->child_up_event_gen), + priv->child_count, gf_afr_mt_int32_t); + if (!info->child_up_event_gen) { + goto cleanup; + } + info->child_down_event_gen = GF_CALLOC(sizeof(*info->child_down_event_gen), + priv->child_count, + gf_afr_mt_int32_t); + if (!info->child_down_event_gen) { + goto cleanup; + } + + LOCK(&local->fd->lock); + { + fd_ctx = __afr_fd_ctx_get(local->fd, this); + if (fd_ctx) + fd_ctx->lk_heal_info = info; + } + UNLOCK(&local->fd->lock); + if (!fd_ctx) { + goto cleanup; + } + + LOCK(&priv->lock); + { + list_add_tail(&info->pos, &priv->saved_locks); + } + UNLOCK(&priv->lock); + + return 0; +cleanup: + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to add lock to healq", + uuid_utoa(local->fd->inode->gfid)); + if (info) { + afr_lk_heal_info_cleanup(info); + if (fd_ctx) { + LOCK(&local->fd->lock); + { + fd_ctx->lk_heal_info = NULL; + } + UNLOCK(&local->fd->lock); } + } + return ret; +} +static int +afr_remove_lock_from_saved_locks(afr_local_t *local, xlator_t *this) +{ + afr_private_t *priv = this->private; + struct gf_flock flock = local->cont.lk.user_flock; + afr_lk_heal_info_t *info = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int ret = -EINVAL; + + fd_ctx = afr_fd_ctx_get(local->fd, this); + if (!fd_ctx || !fd_ctx->lk_heal_info) { + goto out; + } + + info = fd_ctx->lk_heal_info; + if ((info->flock.l_start != flock.l_start) || + (info->flock.l_whence != flock.l_whence) || + (info->flock.l_len != flock.l_len)) { + /*TODO: Compare lkowners too.*/ + goto out; + } + + LOCK(&priv->lock); + { + list_del(&fd_ctx->lk_heal_info->pos); + } + UNLOCK(&priv->lock); + + afr_lk_heal_info_cleanup(info); + fd_ctx->lk_heal_info = NULL; + ret = 0; out: - return ctx; - -fail: - afr_inode_ctx_destroy (ctx); - return NULL; + if (ret) + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to remove lock from healq", + uuid_utoa(local->fd->inode->gfid)); + return ret; } -afr_inode_ctx_t* -afr_inode_ctx_get (inode_t *inode, xlator_t *this) +int +afr_lock_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) { - afr_inode_ctx_t *ctx = NULL; + afr_local_t *local = frame->local; + int i = (long)cookie; - LOCK (&inode->lock); - { - ctx = __afr_inode_ctx_get (inode, this); - } - UNLOCK (&inode->lock); - return ctx; + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (op_ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "Failed to heal lock on child %d for %s", i, + uuid_utoa(local->fd->inode->gfid)); + } + syncbarrier_wake(&local->barrier); + return 0; } -void -afr_inode_get_ctx_params (xlator_t *this, inode_t *inode, - afr_inode_params_t *params) +int +afr_getlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct gf_flock *lock, dict_t *xdata) { - GF_ASSERT (inode); - GF_ASSERT (params); + afr_local_t *local = frame->local; + int i = (long)cookie; - afr_inode_ctx_t *ctx = NULL; - afr_private_t *priv = NULL; - int i = 0; - int32_t read_child = -1; - int32_t *fresh_children = NULL; + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (op_ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "Failed getlk for %s", uuid_utoa(local->fd->inode->gfid)); + } else { + local->cont.lk.getlk_rsp[i] = *lock; + } - priv = this->private; - LOCK (&inode->lock); - { - ctx = __afr_inode_ctx_get (inode, this); - if (!ctx) - goto unlock; - switch (params->op) { - case AFR_INODE_GET_READ_CTX: - fresh_children = params->u.read_ctx.children; - read_child = (int32_t)(ctx->masks & - AFR_ICTX_READ_CHILD_MASK); - params->u.read_ctx.read_child = read_child; - if (!fresh_children) - goto unlock; - for (i = 0; i < priv->child_count; i++) - fresh_children[i] = ctx->fresh_children[i]; - break; - case AFR_INODE_GET_OPENDIR_DONE: - params->u.value = _gf_false; - if (ctx->masks & AFR_ICTX_OPENDIR_DONE_MASK) - params->u.value = _gf_true; - break; - default: - GF_ASSERT (0); - break; - } - } -unlock: - UNLOCK (&inode->lock); + syncbarrier_wake(&local->barrier); + return 0; } -gf_boolean_t -afr_is_split_brain (xlator_t *this, inode_t *inode) -{ - afr_inode_ctx_t *ctx = NULL; - gf_boolean_t spb = _gf_false; - - ctx = afr_inode_ctx_get (inode, this); - if (!ctx) - goto out; - if ((ctx->mdata_spb == SPB) || (ctx->data_spb == SPB)) - spb = _gf_true; +static gf_boolean_t +afr_does_lk_owner_match(call_frame_t *frame, afr_private_t *priv, + afr_lk_heal_info_t *info) +{ + int i = 0; + afr_local_t *local = frame->local; + struct gf_flock flock = { + 0, + }; + gf_boolean_t ret = _gf_true; + char *wind_on = alloca0(priv->child_count); + unsigned char *success_replies = alloca0(priv->child_count); + local->cont.lk.getlk_rsp = GF_CALLOC(sizeof(*local->cont.lk.getlk_rsp), + priv->child_count, gf_afr_mt_gf_lock); + + flock = info->flock; + for (i = 0; i < priv->child_count; i++) { + if (info->locked_nodes[i]) + wind_on[i] = 1; + } + + AFR_ONLIST(wind_on, frame, afr_getlk_cbk, lk, info->fd, F_GETLK, &flock, + info->xdata_req); + + afr_fill_success_replies(local, priv, success_replies); + if (AFR_COUNT(success_replies, priv->child_count) == 0) { + ret = _gf_false; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid || local->replies[i].op_ret != 0) + continue; + if (local->cont.lk.getlk_rsp[i].l_type == F_UNLCK) + continue; + /*TODO: Do we really need to compare lkowner if F_UNLCK is true?*/ + if (!is_same_lkowner(&local->cont.lk.getlk_rsp[i].l_owner, + &info->lk_owner)) { + ret = _gf_false; + break; + } + } out: - return spb; + afr_local_replies_wipe(local, priv); + GF_FREE(local->cont.lk.getlk_rsp); + local->cont.lk.getlk_rsp = NULL; + return ret; } -gf_boolean_t -afr_is_opendir_done (xlator_t *this, inode_t *inode) +static void +afr_mark_fd_bad(fd_t *fd, xlator_t *this) { - afr_inode_params_t params = {0}; + afr_fd_ctx_t *fd_ctx = NULL; - params.op = AFR_INODE_GET_OPENDIR_DONE; - afr_inode_get_ctx_params (this, inode, ¶ms); - return params.u.value; + if (!fd) + return; + LOCK(&fd->lock); + { + fd_ctx = __afr_fd_ctx_get(fd, this); + if (fd_ctx) { + fd_ctx->is_fd_bad = _gf_true; + fd_ctx->lk_heal_info = NULL; + } + } + UNLOCK(&fd->lock); } -int32_t -afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children) +static void +afr_add_lock_to_lkhealq(afr_private_t *priv, afr_lk_heal_info_t *info) { - afr_inode_params_t params = {0}; + LOCK(&priv->lock); + { + list_del(&info->pos); + list_add_tail(&info->pos, &priv->lk_healq); + } + UNLOCK(&priv->lock); +} - params.op = AFR_INODE_GET_READ_CTX; - params.u.read_ctx.children = fresh_children; - afr_inode_get_ctx_params (this, inode, ¶ms); - return params.u.read_ctx.read_child; +static void +afr_lock_heal_do(call_frame_t *frame, afr_private_t *priv, + afr_lk_heal_info_t *info) +{ + int i = 0; + int op_errno = 0; + int32_t *current_event_gen = NULL; + afr_local_t *local = frame->local; + xlator_t *this = frame->this; + char *wind_on = alloca0(priv->child_count); + gf_boolean_t retry = _gf_true; + + frame->root->pid = info->pid; + lk_owner_copy(&frame->root->lk_owner, &info->lk_owner); + + op_errno = -afr_dom_lock_acquire(frame); + if ((op_errno != 0)) { + goto release; + } + + if (!afr_does_lk_owner_match(frame, priv, info)) { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_LK_HEAL_DOM, + "Ignoring lock heal for %s since lk-onwers mismatch. " + "Lock possibly pre-empted by another client.", + uuid_utoa(info->fd->inode->gfid)); + goto release; + } + + for (i = 0; i < priv->child_count; i++) { + if (info->locked_nodes[i]) + continue; + wind_on[i] = 1; + } + + current_event_gen = alloca(priv->child_count); + memcpy(current_event_gen, info->child_up_event_gen, + priv->child_count * sizeof *current_event_gen); + AFR_ONLIST(wind_on, frame, afr_lock_heal_cbk, lk, info->fd, info->cmd, + &info->flock, info->xdata_req); + + LOCK(&priv->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (!wind_on[i]) + continue; + if ((!local->replies[i].valid) || (local->replies[i].op_ret != 0)) { + continue; + } + + if ((current_event_gen[i] == info->child_up_event_gen[i]) && + (current_event_gen[i] > info->child_down_event_gen[i])) { + info->locked_nodes[i] = 1; + retry = _gf_false; + list_del_init(&info->pos); + list_add_tail(&info->pos, &priv->saved_locks); + } else { + /*We received subsequent child up/down events while heal was in + * progress; don't mark child as healed. Attempt again on the + * new child up*/ + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_LK_HEAL_DOM, + "Event gen mismatch: skipped healing lock on child %d " + "for %s.", + i, uuid_utoa(info->fd->inode->gfid)); + } + } + } + UNLOCK(&priv->lock); + +release: + afr_dom_lock_release(frame); + if (retry) + afr_add_lock_to_lkhealq(priv, info); + return; } -void -afr_inode_ctx_set_read_child (afr_inode_ctx_t *ctx, int32_t read_child) +static int +afr_lock_heal_done(int ret, call_frame_t *frame, void *opaque) { - uint64_t remaining_mask = 0; - uint64_t mask = 0; + STACK_DESTROY(frame->root); + return 0; +} - remaining_mask = (~AFR_ICTX_READ_CHILD_MASK & ctx->masks); - mask = (AFR_ICTX_READ_CHILD_MASK & read_child); - ctx->masks = remaining_mask | mask; +static int +afr_lock_heal(void *opaque) +{ + call_frame_t *frame = (call_frame_t *)opaque; + call_frame_t *iter_frame = NULL; + xlator_t *this = frame->this; + afr_private_t *priv = this->private; + afr_lk_heal_info_t *info = NULL; + afr_lk_heal_info_t *tmp = NULL; + struct list_head healq = { + 0, + }; + int ret = 0; + + iter_frame = afr_copy_frame(frame); + if (!iter_frame) { + return ENOMEM; + } + + INIT_LIST_HEAD(&healq); + LOCK(&priv->lock); + { + list_splice_init(&priv->lk_healq, &healq); + } + UNLOCK(&priv->lock); + + list_for_each_entry_safe(info, tmp, &healq, pos) + { + GF_ASSERT((AFR_COUNT(info->locked_nodes, priv->child_count) < + priv->child_count)); + ((afr_local_t *)(iter_frame->local))->fd = fd_ref(info->fd); + afr_lock_heal_do(iter_frame, priv, info); + AFR_STACK_RESET(iter_frame); + if (iter_frame->local == NULL) { + ret = ENOTCONN; + gf_msg(frame->this->name, GF_LOG_ERROR, ENOTCONN, + AFR_MSG_LK_HEAL_DOM, + "Aborting processing of lk_healq." + "Healing will be reattempted on next child up for locks " + "that are still in quorum."); + LOCK(&priv->lock); + { + list_add_tail(&healq, &priv->lk_healq); + } + UNLOCK(&priv->lock); + break; + } + } + + AFR_STACK_DESTROY(iter_frame); + return ret; } -void -afr_inode_ctx_set_read_ctx (afr_inode_ctx_t *ctx, int32_t read_child, - int32_t *fresh_children, int32_t child_count) +static int +__afr_lock_heal_synctask(xlator_t *this, afr_private_t *priv, int child) { - int i = 0; + int ret = 0; + call_frame_t *frame = NULL; + afr_lk_heal_info_t *info = NULL; + afr_lk_heal_info_t *tmp = NULL; - afr_inode_ctx_set_read_child (ctx, read_child); - for (i = 0; i < child_count; i++) { - if (fresh_children) - ctx->fresh_children[i] = fresh_children[i]; - else - ctx->fresh_children[i] = -1; - } -} + if (priv->shd.iamshd) + return 0; -void -afr_inode_ctx_rm_stale_children (afr_inode_ctx_t *ctx, int32_t *stale_children, - int32_t child_count) -{ - int i = 0; - int32_t read_child = -1; + list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos) + { + info->child_up_event_gen[child] = priv->event_generation; + list_del_init(&info->pos); + list_add_tail(&info->pos, &priv->lk_healq); + } - GF_ASSERT (stale_children); - for (i = 0; i < child_count; i++) { - if (stale_children[i] == -1) - break; - afr_children_rm_child (ctx->fresh_children, - stale_children[i], child_count); - } - read_child = (int32_t)(ctx->masks & AFR_ICTX_READ_CHILD_MASK); - if (!afr_is_child_present (ctx->fresh_children, child_count, - read_child)) - afr_inode_ctx_set_read_child (ctx, ctx->fresh_children[0]); + frame = create_frame(this, this->ctx->pool); + if (!frame) + return -1; + + ret = synctask_new(this->ctx->env, afr_lock_heal, afr_lock_heal_done, frame, + frame); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_LK_HEAL_DOM, + "Failed to launch lock heal synctask"); + + return ret; } -void -afr_inode_ctx_set_opendir_done (afr_inode_ctx_t *ctx) +static int +__afr_mark_pending_lk_heal(xlator_t *this, afr_private_t *priv, int child) { - uint64_t remaining_mask = 0; - uint64_t mask = 0; + afr_lk_heal_info_t *info = NULL; + afr_lk_heal_info_t *tmp = NULL; + + if (priv->shd.iamshd) + return 0; + list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos) + { + info->child_down_event_gen[child] = priv->event_generation; + if (info->locked_nodes[child] == 1) + info->locked_nodes[child] = 0; + if (!afr_has_quorum(info->locked_nodes, this, NULL)) { + /* Since the lock was lost on quorum no. of nodes, we should + * not attempt to heal it anymore. Some other client could have + * acquired the lock, modified data and released it and this + * client wouldn't know about it if we heal it.*/ + afr_mark_fd_bad(info->fd, this); + list_del(&info->pos); + afr_lk_heal_info_cleanup(info); + /* We're not winding an unlock on the node where the lock is still + * present because when fencing logic switches over to the new + * client (since we marked the fd bad), it should preempt any + * existing lock. */ + } + } + return 0; +} - remaining_mask = (~AFR_ICTX_OPENDIR_DONE_MASK & ctx->masks); - mask = (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_OPENDIR_DONE_MASK); - ctx->masks = remaining_mask | mask; +gf_boolean_t +afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv, + int32_t *op_errno) +{ + if (priv->consistent_io && local->call_count != priv->child_count) { + gf_msg(THIS->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOLS_DOWN, + "All subvolumes are not up"); + if (op_errno) + *op_errno = ENOTCONN; + return _gf_false; + } + return _gf_true; } -void -afr_inode_set_ctx_params (xlator_t *this, inode_t *inode, - afr_inode_params_t *params) +gf_boolean_t +afr_is_lock_mode_mandatory(dict_t *xdata) { - GF_ASSERT (inode); - GF_ASSERT (params); + int ret = 0; + uint32_t lk_mode = GF_LK_ADVISORY; - afr_inode_ctx_t *ctx = NULL; - afr_private_t *priv = NULL; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - int32_t *stale_children = NULL; + ret = dict_get_uint32(xdata, GF_LOCK_MODE, &lk_mode); + if (!ret && lk_mode == GF_LK_MANDATORY) + return _gf_true; - priv = this->private; - LOCK (&inode->lock); - { - ctx = __afr_inode_ctx_get (inode, this); - if (!ctx) - goto unlock; - switch (params->op) { - case AFR_INODE_SET_READ_CTX: - read_child = params->u.read_ctx.read_child; - fresh_children = params->u.read_ctx.children; - afr_inode_ctx_set_read_ctx (ctx, read_child, - fresh_children, - priv->child_count); - break; - case AFR_INODE_RM_STALE_CHILDREN: - stale_children = params->u.read_ctx.children; - afr_inode_ctx_rm_stale_children (ctx, - stale_children, - priv->child_count); - break; - case AFR_INODE_SET_OPENDIR_DONE: - afr_inode_ctx_set_opendir_done (ctx); - break; - default: - GF_ASSERT (0); - break; - } - } -unlock: - UNLOCK (&inode->lock); + return _gf_false; } -void -afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb, - afr_spb_state_t data_spb) +call_frame_t * +afr_copy_frame(call_frame_t *base) { - afr_inode_ctx_t *ctx = NULL; + afr_local_t *local = NULL; + call_frame_t *frame = NULL; + int op_errno = 0; - ctx = afr_inode_ctx_get (inode, this); - if (mdata_spb != DONT_KNOW) - ctx->mdata_spb = mdata_spb; - if (data_spb != DONT_KNOW) - ctx->data_spb = data_spb; + frame = copy_frame(base); + if (!frame) + return NULL; + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) { + AFR_STACK_DESTROY(frame); + return NULL; + } + + return frame; } -void -afr_set_opendir_done (xlator_t *this, inode_t *inode) -{ - afr_inode_params_t params = {0}; +/* Check if an entry or inode could be undergoing a transaction. */ +gf_boolean_t +afr_is_possibly_under_txn(afr_transaction_type type, afr_local_t *local, + xlator_t *this) +{ + int i = 0; + int tmp = 0; + afr_private_t *priv = NULL; + GF_UNUSED char *key = NULL; + int keylen = 0; + + priv = this->private; + + if (type == AFR_ENTRY_TRANSACTION) { + key = GLUSTERFS_PARENT_ENTRYLK; + keylen = SLEN(GLUSTERFS_PARENT_ENTRYLK); + } else if (type == AFR_DATA_TRANSACTION) { + /*FIXME: Use GLUSTERFS_INODELK_DOM_COUNT etc. once + * pl_inodelk_xattr_fill supports separate keys for different + * domains.*/ + key = GLUSTERFS_INODELK_COUNT; + keylen = SLEN(GLUSTERFS_INODELK_COUNT); + } + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].xdata) + continue; + if (dict_get_int32n(local->replies[i].xdata, key, keylen, &tmp) == 0) + if (tmp) + return _gf_true; + } - params.op = AFR_INODE_SET_OPENDIR_DONE; - afr_inode_set_ctx_params (this, inode, ¶ms); + return _gf_false; } -void -afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, - int32_t *fresh_children) +static void +afr_inode_ctx_destroy(afr_inode_ctx_t *ctx) { - afr_inode_params_t params = {0}; - afr_private_t *priv = NULL; + int i = 0; - priv = this->private; - GF_ASSERT (read_child >= 0); - GF_ASSERT (fresh_children); - GF_ASSERT (afr_is_child_present (fresh_children, priv->child_count, - read_child)); + if (!ctx) + return; + + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { + GF_FREE(ctx->pre_op_done[i]); + } - params.op = AFR_INODE_SET_READ_CTX; - params.u.read_ctx.read_child = read_child; - params.u.read_ctx.children = fresh_children; - afr_inode_set_ctx_params (this, inode, ¶ms); + GF_FREE(ctx); } -void -afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, - int32_t *stale_children) -{ - afr_inode_params_t params = {0}; +int +__afr_inode_ctx_get(xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx) +{ + uint64_t ctx_int = 0; + int ret = -1; + int i = -1; + int num_locks = -1; + afr_inode_ctx_t *ictx = NULL; + afr_lock_t *lock = NULL; + afr_private_t *priv = this->private; + + ret = __inode_ctx_get(inode, this, &ctx_int); + if (ret == 0) { + *ctx = (afr_inode_ctx_t *)(uintptr_t)ctx_int; + return 0; + } + + ictx = GF_CALLOC(1, sizeof(afr_inode_ctx_t), gf_afr_mt_inode_ctx_t); + if (!ictx) + goto out; + + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { + ictx->pre_op_done[i] = GF_CALLOC(sizeof *ictx->pre_op_done[i], + priv->child_count, gf_afr_mt_int32_t); + if (!ictx->pre_op_done[i]) { + ret = -ENOMEM; + goto out; + } + } + + num_locks = sizeof(ictx->lock) / sizeof(afr_lock_t); + for (i = 0; i < num_locks; i++) { + lock = &ictx->lock[i]; + INIT_LIST_HEAD(&lock->post_op); + INIT_LIST_HEAD(&lock->frozen); + INIT_LIST_HEAD(&lock->waiting); + INIT_LIST_HEAD(&lock->owners); + } + + ctx_int = (uint64_t)(uintptr_t)ictx; + ret = __inode_ctx_set(inode, this, &ctx_int); + if (ret) { + goto out; + } + + ictx->spb_choice = -1; + ictx->read_subvol = 0; + ictx->write_subvol = 0; + ictx->lock_count = 0; + ret = 0; + *ctx = ictx; +out: + if (ret) { + afr_inode_ctx_destroy(ictx); + } + return ret; +} + +/* + * INODE CTX 64-bit VALUE FORMAT FOR SMALL (<= 16) SUBVOL COUNTS: + * + * |<---------- 64bit ------------>| + * 63 32 31 16 15 0 + * | EVENT_GEN | DATA | METADATA | + * + * + * METADATA (bit-0 .. bit-15): bitmap representing subvolumes from which + * metadata can be attempted to be read. + * + * bit-0 => priv->subvolumes[0] + * bit-1 => priv->subvolumes[1] + * ... etc. till bit-15 + * + * DATA (bit-16 .. bit-31): bitmap representing subvolumes from which data + * can be attempted to be read. + * + * bit-16 => priv->subvolumes[0] + * bit-17 => priv->subvolumes[1] + * ... etc. till bit-31 + * + * EVENT_GEN (bit-32 .. bit-63): event generation (i.e priv->event_generation) + * when DATA and METADATA was last updated. + * + * If EVENT_GEN is < priv->event_generation, + * or is 0, it means afr_inode_refresh() needs + * to be called to recalculate the bitmaps. + */ - GF_ASSERT (stale_children); +int +__afr_set_in_flight_sb_status(xlator_t *this, afr_local_t *local, + inode_t *inode) +{ + int i = 0; + int txn_type = 0; + int count = 0; + int index = -1; + uint16_t datamap_old = 0; + uint16_t metadatamap_old = 0; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint16_t tmp_map = 0; + uint16_t mask = 0; + uint32_t event = 0; + uint64_t val = 0; + afr_private_t *priv = NULL; + + priv = this->private; + txn_type = local->transaction.type; + + if (txn_type == AFR_DATA_TRANSACTION) + val = local->inode_ctx->write_subvol; + else + val = local->inode_ctx->read_subvol; + + metadatamap_old = metadatamap = (val & 0x000000000000ffff); + datamap_old = datamap = (val & 0x00000000ffff0000) >> 16; + event = (val & 0xffffffff00000000) >> 32; + + if (txn_type == AFR_DATA_TRANSACTION) + tmp_map = datamap; + else if (txn_type == AFR_METADATA_TRANSACTION) + tmp_map = metadatamap; + + count = gf_bits_count(tmp_map); + + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.failed_subvols[i]) + continue; + + mask = 1 << i; + if (txn_type == AFR_METADATA_TRANSACTION) + metadatamap &= ~mask; + else if (txn_type == AFR_DATA_TRANSACTION) + datamap &= ~mask; + } + + switch (txn_type) { + case AFR_METADATA_TRANSACTION: + if ((metadatamap_old != 0) && (metadatamap == 0) && (count == 1)) { + index = gf_bits_index(tmp_map); + local->transaction.in_flight_sb_errno = local->replies[index] + .op_errno; + local->transaction.in_flight_sb = _gf_true; + metadatamap |= (1 << index); + } + if (metadatamap_old != metadatamap) { + __afr_inode_need_refresh_set(inode, this); + } + break; + + case AFR_DATA_TRANSACTION: + if ((datamap_old != 0) && (datamap == 0) && (count == 1)) { + index = gf_bits_index(tmp_map); + local->transaction.in_flight_sb_errno = local->replies[index] + .op_errno; + local->transaction.in_flight_sb = _gf_true; + datamap |= (1 << index); + } + if (datamap_old != datamap) + __afr_inode_need_refresh_set(inode, this); + break; + + default: + break; + } - params.op = AFR_INODE_RM_STALE_CHILDREN; - params.u.read_ctx.children = stale_children; - afr_inode_set_ctx_params (this, inode, ¶ms); + val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) | + (((uint64_t)event) << 32); + + if (txn_type == AFR_DATA_TRANSACTION) + local->inode_ctx->write_subvol = val; + local->inode_ctx->read_subvol = val; + + return 0; } gf_boolean_t -afr_is_source_child (int32_t *sources, int32_t child_count, int32_t child) +afr_is_symmetric_error(call_frame_t *frame, xlator_t *this) { - gf_boolean_t source_xattrs = _gf_false; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int op_errno = 0; + int i_errno = 0; + gf_boolean_t matching_errors = _gf_true; + int i = 0; + + priv = this->private; + local = frame->local; - GF_ASSERT (child < child_count); + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret != -1) { + /* Operation succeeded on at least one subvol, + so it is not a failed-everywhere situation. + */ + matching_errors = _gf_false; + break; + } + i_errno = local->replies[i].op_errno; + + if (i_errno == ENOTCONN) { + /* ENOTCONN is not a symmetric error. We do not + know if the operation was performed on the + backend or not. + */ + matching_errors = _gf_false; + break; + } - if ((child >= 0) && (child < child_count) && - sources[child]) { - source_xattrs = _gf_true; + if (!op_errno) { + op_errno = i_errno; + } else if (op_errno != i_errno) { + /* Mismatching op_errno's */ + matching_errors = _gf_false; + break; } - return source_xattrs; + } + + return matching_errors; } -gf_boolean_t -afr_is_child_present (int32_t *success_children, int32_t child_count, - int32_t child) +int +afr_set_in_flight_sb_status(xlator_t *this, call_frame_t *frame, inode_t *inode) { - gf_boolean_t success_child = _gf_false; - int i = 0; + int ret = -1; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; - GF_ASSERT (child < child_count); + priv = this->private; + local = frame->local; - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - if (child == success_children[i]) { - success_child = _gf_true; - break; - } - } - return success_child; + /* If this transaction saw no failures, then exit. */ + if (AFR_COUNT(local->transaction.failed_subvols, priv->child_count) == 0) + return 0; + + if (afr_is_symmetric_error(frame, this)) + return 0; + + LOCK(&inode->lock); + { + ret = __afr_set_in_flight_sb_status(this, local, inode); + } + UNLOCK(&inode->lock); + + return ret; } -gf_boolean_t -afr_is_read_child (int32_t *success_children, int32_t *sources, - int32_t child_count, int32_t child) +int +__afr_inode_read_subvol_get_small(inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int *event_p) +{ + afr_private_t *priv = NULL; + int ret = -1; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint32_t event = 0; + uint64_t val = 0; + int i = 0; + afr_inode_ctx_t *ctx = NULL; + + priv = this->private; + + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret < 0) + return ret; + + val = ctx->read_subvol; + + metadatamap = (val & 0x000000000000ffff); + datamap = (val & 0x00000000ffff0000) >> 16; + event = (val & 0xffffffff00000000) >> 32; + + for (i = 0; i < priv->child_count; i++) { + if (metadata) + metadata[i] = (metadatamap >> i) & 1; + if (data) + data[i] = (datamap >> i) & 1; + } + + if (event_p) + *event_p = event; + return ret; +} + +int +__afr_inode_read_subvol_set_small(inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int event) { - gf_boolean_t success_child = _gf_false; - gf_boolean_t source = _gf_false; + afr_private_t *priv = NULL; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint64_t val = 0; + int i = 0; + int ret = -1; + afr_inode_ctx_t *ctx = NULL; - if (child < 0) { - return _gf_false; - } + priv = this->private; - GF_ASSERT (success_children); - GF_ASSERT (child_count > 0); + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret) + goto out; - success_child = afr_is_child_present (success_children, child_count, - child); - if (!success_child) - goto out; - if (NULL == sources) { - source = _gf_true; - goto out; - } - source = afr_is_source_child (sources, child_count, child); + for (i = 0; i < priv->child_count; i++) { + if (data[i]) + datamap |= (1 << i); + if (metadata[i]) + metadatamap |= (1 << i); + } + + val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) | + (((uint64_t)event) << 32); + + ctx->read_subvol = val; + + ret = 0; out: - return (success_child && source); + return ret; } -int32_t -afr_hash_child (int32_t *success_children, int32_t child_count, - unsigned int hmode, uuid_t gfid) +int +__afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int *event_p) { - uuid_t gfid_copy = {0,}; - pid_t pid; + afr_private_t *priv = NULL; + int ret = -1; - if (!hmode) { - return -1; - } + priv = this->private; - if (gfid) { - uuid_copy(gfid_copy,gfid); - } - if (hmode > 1) { - /* - * Why getpid? Because it's one of the cheapest calls - * available - faster than gethostname etc. - and returns a - * constant-length value that's sure to be shorter than a UUID. - * It's still very unlikely to be the same across clients, so - * it still provides good mixing. We're not trying for - * perfection here. All we need is a low probability that - * multiple clients won't converge on the same subvolume. - */ - pid = getpid(); - memcpy (gfid_copy, &pid, sizeof(pid)); - } + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_get_small(inode, this, data, metadata, + event_p); + else + /* TBD: allocate structure with array and read from it */ + ret = -1; - return SuperFastHash((char *)gfid_copy, - sizeof(gfid_copy)) % child_count; + return ret; } -/* If sources is NULL the xattrs are assumed to be of source for all - * success_children. - */ int -afr_select_read_child_from_policy (int32_t *success_children, - int32_t child_count, int32_t prev_read_child, - int32_t config_read_child, int32_t *sources, - unsigned int hmode, uuid_t gfid) +__afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this, + int *spb_choice) { - int32_t read_child = -1; - int i = 0; + afr_inode_ctx_t *ctx = NULL; + int ret = -1; - GF_ASSERT (success_children); + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret < 0) + return ret; - read_child = config_read_child; - if (afr_is_read_child (success_children, sources, child_count, - read_child)) - goto out; + *spb_choice = ctx->spb_choice; + return 0; +} - read_child = prev_read_child; - if (afr_is_read_child (success_children, sources, child_count, - read_child)) - goto out; +int +__afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int event) +{ + afr_private_t *priv = NULL; + int ret = -1; - read_child = afr_hash_child (success_children, child_count, - hmode, gfid); - if (afr_is_read_child (success_children, sources, child_count, - read_child)) { - goto out; - } + priv = this->private; - for (i = 0; i < child_count; i++) { - read_child = success_children[i]; - if (read_child < 0) - break; - if (afr_is_read_child (success_children, sources, child_count, - read_child)) - goto out; - } - read_child = -1; + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_set_small(inode, this, data, metadata, + event); + else + ret = -1; -out: - return read_child; + return ret; } -/* This function should be used when all the success_children are sources - */ -void -afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, - int32_t *fresh_children, int32_t prev_read_child, - int32_t config_read_child, uuid_t gfid) -{ - int read_child = -1; - afr_private_t *priv = NULL; - - priv = this->private; - read_child = afr_select_read_child_from_policy (fresh_children, - priv->child_count, - prev_read_child, - config_read_child, - NULL, - priv->hash_mode, gfid); - if (read_child >= 0) - afr_inode_set_read_ctx (this, inode, read_child, - fresh_children); -} - -/* afr_next_call_child () - * This is a common function used by all the read-type fops - * This function should not be called with the inode's read_children array. - * The fop's handler should make a copy of the inode's read_children, - * preferred read_child into the local vars, because while this function is - * in execution there is a chance for inode's read_ctx to change. - */ -int32_t -afr_next_call_child (int32_t *fresh_children, unsigned char *child_up, - size_t child_count, int32_t *last_index, - int32_t read_child) +int +__afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this, + int spb_choice) { - int next_index = 0; - int32_t next_call_child = -1; + afr_inode_ctx_t *ctx = NULL; + int ret = -1; - GF_ASSERT (last_index); + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret) + goto out; - next_index = *last_index; -retry: - next_index++; - if ((next_index >= child_count) || - (fresh_children[next_index] == -1)) - goto out; - if ((fresh_children[next_index] == read_child) || - (!child_up[fresh_children[next_index]])) - goto retry; - *last_index = next_index; - next_call_child = fresh_children[next_index]; + ctx->spb_choice = spb_choice; + + ret = 0; out: - return next_call_child; + return ret; } - /* This function should not be called with the inode's read_children array. - * The fop's handler should make a copy of the inode's read_children, - * preferred read_child into the local vars, because while this function is - * in execution there is a chance for inode's read_ctx to change. - */ -int32_t -afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child, - int32_t *fresh_children, - int32_t *call_child, int32_t *last_index) +int +afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int *event_p) { - int ret = 0; - afr_private_t *priv = NULL; - int i = 0; + int ret = -1; - GF_ASSERT (child_up); - GF_ASSERT (call_child); - GF_ASSERT (last_index); - GF_ASSERT (fresh_children); + GF_VALIDATE_OR_GOTO(this->name, inode, out); - if (read_child < 0) { - ret = -EIO; - goto out; - } - priv = this->private; - *call_child = -1; - *last_index = -1; + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_get(inode, this, data, metadata, event_p); + } + UNLOCK(&inode->lock); +out: + return ret; +} - if (child_up[read_child]) { - *call_child = read_child; - } else { - for (i = 0; i < priv->child_count; i++) { - if (fresh_children[i] == -1) - break; - if (child_up[fresh_children[i]]) { - *call_child = fresh_children[i]; - ret = 0; - break; - } - } +int +afr_inode_get_readable(call_frame_t *frame, inode_t *inode, xlator_t *this, + unsigned char *readable, int *event_p, int type) +{ + afr_private_t *priv = this->private; + afr_local_t *local = frame->local; + unsigned char *data = alloca0(priv->child_count); + unsigned char *metadata = alloca0(priv->child_count); + int data_count = 0; + int metadata_count = 0; + int event_generation = 0; + int ret = 0; + + ret = afr_inode_read_subvol_get(inode, this, data, metadata, + &event_generation); + if (ret == -1) + return -EIO; + + data_count = AFR_COUNT(data, priv->child_count); + metadata_count = AFR_COUNT(metadata, priv->child_count); + + if (inode->ia_type == IA_IFDIR) { + /* For directories, allow even if it is in data split-brain. */ + if (type == AFR_METADATA_TRANSACTION || local->op == GF_FOP_STAT || + local->op == GF_FOP_FSTAT) { + if (!metadata_count) + return -EIO; + } + } else { + /* For files, abort in case of data/metadata split-brain. */ + if (!data_count || !metadata_count) { + return -EIO; + } + } + + if (type == AFR_METADATA_TRANSACTION && readable) + memcpy(readable, metadata, priv->child_count * sizeof *metadata); + if (type == AFR_DATA_TRANSACTION && readable) { + if (!data_count) + memcpy(readable, local->child_up, + priv->child_count * sizeof *readable); + else + memcpy(readable, data, priv->child_count * sizeof *data); + } + if (event_p) + *event_p = event_generation; + return 0; +} - if (*call_child == -1) { - ret = -ENOTCONN; - goto out; - } +static int +afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this, + int *spb_choice) +{ + int ret = -1; + GF_VALIDATE_OR_GOTO(this->name, inode, out); - *last_index = i; - } + LOCK(&inode->lock); + { + ret = __afr_inode_split_brain_choice_get(inode, this, spb_choice); + } + UNLOCK(&inode->lock); out: - gf_log (this->name, GF_LOG_DEBUG, "Returning %d, call_child: %d, " - "last_index: %d", ret, *call_child, *last_index); - return ret; + return ret; } -void -afr_reset_xattr (dict_t **xattr, unsigned int child_count) +/* + * frame is used to get the favourite policy. Since + * afr_inode_split_brain_choice_get was called with afr_open, it is possible to + * have a frame with out local->replies. So in that case, frame is passed as + * null, hence this function will handle the frame NULL case. + */ +int +afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this, + call_frame_t *frame, int *spb_subvol) { - unsigned int i = 0; + int ret = -1; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - if (!xattr) - goto out; - for (i = 0; i < child_count; i++) { - if (xattr[i]) { - dict_unref (xattr[i]); - xattr[i] = NULL; - } + GF_VALIDATE_OR_GOTO("afr", this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + GF_VALIDATE_OR_GOTO(this->name, spb_subvol, out); + + priv = this->private; + + ret = afr_inode_split_brain_choice_get(inode, this, spb_subvol); + if (*spb_subvol < 0 && priv->fav_child_policy && frame && frame->local) { + local = frame->local; + *spb_subvol = afr_sh_get_fav_by_policy(this, local->replies, inode, + NULL); + if (*spb_subvol >= 0) { + ret = 0; } + } + out: - return; + return ret; } +int +afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int event) +{ + int ret = -1; -void -afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count) + GF_VALIDATE_OR_GOTO(this->name, inode, out); + + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_set(inode, this, data, metadata, event); + } + UNLOCK(&inode->lock); +out: + return ret; +} + +int +afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this, int spb_choice) { - afr_reset_xattr (xattr, child_count); - GF_FREE (xattr); + int ret = -1; + + GF_VALIDATE_OR_GOTO(this->name, inode, out); + + LOCK(&inode->lock); + { + ret = __afr_inode_split_brain_choice_set(inode, this, spb_choice); + } + UNLOCK(&inode->lock); +out: + return ret; } -void -afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) +/* The caller of this should perform afr_inode_refresh, if this function + * returns _gf_true + */ +gf_boolean_t +afr_is_inode_refresh_reqd(inode_t *inode, xlator_t *this, int event_gen1, + int event_gen2) { - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; + gf_boolean_t need_refresh = _gf_false; + afr_inode_ctx_t *ctx = NULL; + int ret = -1; - sh = &local->self_heal; - priv = this->private; + GF_VALIDATE_OR_GOTO(this->name, inode, out); - if (sh->data_sh_info && strcmp (sh->data_sh_info, "")) - GF_FREE (sh->data_sh_info); + LOCK(&inode->lock); + { + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret) + goto unlock; - if (sh->metadata_sh_info && strcmp (sh->metadata_sh_info, "")) - GF_FREE (sh->metadata_sh_info); + need_refresh = ctx->need_refresh; + /* Hoping that the caller will do inode_refresh followed by + * this, hence setting the need_refresh to false */ + ctx->need_refresh = _gf_false; + } +unlock: + UNLOCK(&inode->lock); - GF_FREE (sh->buf); + if (event_gen1 != event_gen2) + need_refresh = _gf_true; +out: + return need_refresh; +} - GF_FREE (sh->parentbufs); +int +__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this) +{ + int ret = -1; + afr_inode_ctx_t *ctx = NULL; - if (sh->inode) - inode_unref (sh->inode); + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret == 0) { + ctx->need_refresh = _gf_true; + } - afr_xattr_array_destroy (sh->xattr, priv->child_count); + return ret; +} - GF_FREE (sh->child_errno); +int +afr_inode_need_refresh_set(inode_t *inode, xlator_t *this) +{ + int ret = -1; - afr_matrix_cleanup (sh->pending_matrix, priv->child_count); - afr_matrix_cleanup (sh->delta_matrix, priv->child_count); + GF_VALIDATE_OR_GOTO(this->name, inode, out); - GF_FREE (sh->sources); + LOCK(&inode->lock); + { + ret = __afr_inode_need_refresh_set(inode, this); + } + UNLOCK(&inode->lock); +out: + return ret; +} - GF_FREE (sh->success); +int +afr_spb_choice_timeout_cancel(xlator_t *this, inode_t *inode) +{ + afr_inode_ctx_t *ctx = NULL; + int ret = -1; - GF_FREE (sh->locked_nodes); + if (!inode) + return ret; - if (sh->healing_fd) { - fd_unref (sh->healing_fd); - sh->healing_fd = NULL; + LOCK(&inode->lock); + { + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret < 0 || !ctx) { + UNLOCK(&inode->lock); + gf_msg(this->name, GF_LOG_WARNING, 0, + AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, + "Failed to cancel split-brain choice timer."); + goto out; + } + ctx->spb_choice = -1; + if (ctx->timer) { + gf_timer_call_cancel(this->ctx, ctx->timer); + ctx->timer = NULL; } + ret = 0; + } + UNLOCK(&inode->lock); +out: + return ret; +} - GF_FREE ((char *)sh->linkname); +void +afr_set_split_brain_choice_cbk(void *data) +{ + inode_t *inode = data; + xlator_t *this = THIS; - GF_FREE (sh->success_children); + afr_spb_choice_timeout_cancel(this, inode); + inode_invalidate(inode); + inode_unref(inode); + return; +} - GF_FREE (sh->fresh_children); +int +afr_set_split_brain_choice(int ret, call_frame_t *frame, void *opaque) +{ + int op_errno = ENOMEM; + afr_private_t *priv = NULL; + afr_inode_ctx_t *ctx = NULL; + inode_t *inode = NULL; + loc_t *loc = NULL; + xlator_t *this = NULL; + afr_spbc_timeout_t *data = opaque; + struct timespec delta = { + 0, + }; + gf_boolean_t timer_set = _gf_false; + gf_boolean_t timer_cancelled = _gf_false; + gf_boolean_t timer_reset = _gf_false; + int old_spb_choice = -1; + + frame = data->frame; + loc = data->loc; + this = frame->this; + priv = this->private; + + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + + delta.tv_sec = priv->spb_choice_timeout; + delta.tv_nsec = 0; + + if (!loc->inode) { + ret = -1; + op_errno = EINVAL; + goto out; + } + + if (!(data->d_spb || data->m_spb)) { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, + "Cannot set " + "replica.split-brain-choice on %s. File is" + " not in data/metadata split-brain.", + uuid_utoa(loc->gfid)); + ret = -1; + op_errno = EINVAL; + goto out; + } + + /* + * we're ref'ing the inode before LOCK like it is done elsewhere in the + * code. If we ref after LOCK, coverity complains of possible deadlocks. + */ + inode = inode_ref(loc->inode); + + LOCK(&inode->lock); + { + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret) { + UNLOCK(&inode->lock); + gf_msg(this->name, GF_LOG_ERROR, 0, + AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, + "Failed to get inode_ctx for %s", loc->name); + goto post_unlock; + } - GF_FREE (sh->fresh_parent_dirs); + old_spb_choice = ctx->spb_choice; + ctx->spb_choice = data->spb_child_index; - loc_wipe (&sh->parent_loc); - loc_wipe (&sh->lookup_loc); + /* Possible changes in spb-choice : + * valid to -1 : cancel timer and unref + * valid to valid : cancel timer and inject new one + * -1 to -1 : unref and do not do anything + * -1 to valid : inject timer + */ - GF_FREE (sh->checksum); + /* ctx->timer is NULL iff previous value of + * ctx->spb_choice is -1 + */ + if (ctx->timer) { + if (ctx->spb_choice == -1) { + if (!gf_timer_call_cancel(this->ctx, ctx->timer)) { + ctx->timer = NULL; + timer_cancelled = _gf_true; + } + /* If timer cancel failed here it means that the + * previous cbk will be executed which will set + * spb_choice to -1. So we can consider the + * 'valid to -1' case to be a success + * (i.e. ret = 0) and goto unlock. + */ + goto unlock; + } + goto reset_timer; + } else { + if (ctx->spb_choice == -1) + goto unlock; + goto set_timer; + } - GF_FREE (sh->write_needed); - if (sh->healing_fd) - fd_unref (sh->healing_fd); + reset_timer: + ret = gf_timer_call_cancel(this->ctx, ctx->timer); + if (ret != 0) { + /* We need to bail out now instead of launching a new + * timer. Otherwise the cbk of the previous timer event + * will cancel the new ctx->timer. + */ + ctx->spb_choice = old_spb_choice; + ret = -1; + op_errno = EAGAIN; + goto unlock; + } + ctx->timer = NULL; + timer_reset = _gf_true; + + set_timer: + ctx->timer = gf_timer_call_after(this->ctx, delta, + afr_set_split_brain_choice_cbk, inode); + if (!ctx->timer) { + ctx->spb_choice = old_spb_choice; + ret = -1; + op_errno = ENOMEM; + } + if (!timer_reset && ctx->timer) + timer_set = _gf_true; + if (timer_reset && !ctx->timer) + timer_cancelled = _gf_true; + } +unlock: + UNLOCK(&inode->lock); +post_unlock: + if (!timer_set) + inode_unref(inode); + if (timer_cancelled) + inode_unref(inode); + /* + * We need to invalidate the inode to prevent the kernel from serving + * reads from an older cached value despite a change in spb_choice to + * a new value. + */ + inode_invalidate(inode); +out: + GF_FREE(data); + AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL); + return 0; } - -void -afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) +int +afr_accused_fill(xlator_t *this, dict_t *xdata, unsigned char *accused, + afr_transaction_type type) { - afr_private_t *priv = NULL; - int i = 0; + afr_private_t *priv = NULL; + int i = 0; + int idx = afr_index_for_transaction_type(type); + void *pending_raw = NULL; + int pending[3]; + int ret = 0; - priv = this->private; + priv = this->private; - afr_matrix_cleanup (local->pending, priv->child_count); - afr_matrix_cleanup (local->transaction.txn_changelog, - priv->child_count); + for (i = 0; i < priv->child_count; i++) { + ret = dict_get_ptr(xdata, priv->pending_key[i], &pending_raw); + if (ret) /* no pending flags */ + continue; + memcpy(pending, pending_raw, sizeof(pending)); - GF_FREE (local->internal_lock.locked_nodes); + if (ntoh32(pending[idx])) + accused[i] = 1; + } - for (i = 0; local->internal_lock.inodelk[i].domain; i++) { - GF_FREE (local->internal_lock.inodelk[i].locked_nodes); - } + return 0; +} - GF_FREE (local->internal_lock.lower_locked_nodes); +int +afr_accuse_smallfiles(xlator_t *this, struct afr_reply *replies, + unsigned char *data_accused) +{ + int i = 0; + afr_private_t *priv = NULL; + uint64_t maxsize = 0; - afr_entry_lockee_cleanup (&local->internal_lock); + priv = this->private; - GF_FREE (local->transaction.pre_op); - GF_FREE (local->transaction.eager_lock); + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid && replies[i].xdata && + dict_get_sizen(replies[i].xdata, GLUSTERFS_BAD_INODE)) + continue; + if (data_accused[i]) + continue; + if (replies[i].poststat.ia_size > maxsize) + maxsize = replies[i].poststat.ia_size; + } - GF_FREE (local->transaction.basename); - GF_FREE (local->transaction.new_basename); + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) + continue; + if (AFR_IS_ARBITER_BRICK(priv, i)) + continue; + if (replies[i].poststat.ia_size < maxsize) + data_accused[i] = 1; + } - loc_wipe (&local->transaction.parent_loc); - loc_wipe (&local->transaction.new_parent_loc); + return 0; +} - GF_FREE (local->transaction.postop_piggybacked); +int +afr_readables_fill(call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *data_accused, unsigned char *metadata_accused, + unsigned char *data_readable, + unsigned char *metadata_readable, struct afr_reply *replies) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + int i = 0; + int ret = 0; + ia_type_t ia_type = IA_INVAL; + + local = frame->local; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + data_readable[i] = 1; + metadata_readable[i] = 1; + } + if (AFR_IS_ARBITER_BRICK(priv, ARBITER_BRICK_INDEX)) { + data_readable[ARBITER_BRICK_INDEX] = 0; + metadata_readable[ARBITER_BRICK_INDEX] = 0; + } + + for (i = 0; i < priv->child_count; i++) { + if (replies) { /* Lookup */ + if (!replies[i].valid || replies[i].op_ret == -1 || + (replies[i].xdata && + dict_get_sizen(replies[i].xdata, GLUSTERFS_BAD_INODE))) { + data_readable[i] = 0; + metadata_readable[i] = 0; + continue; + } + + xdata = replies[i].xdata; + ia_type = replies[i].poststat.ia_type; + } else { /* pre-op xattrop */ + xdata = local->transaction.changelog_xdata[i]; + ia_type = inode->ia_type; + } + + if (!xdata) + continue; /* mkdir_cbk sends NULL xdata_rsp. */ + afr_accused_fill(this, xdata, data_accused, + (ia_type == IA_IFDIR) ? AFR_ENTRY_TRANSACTION + : AFR_DATA_TRANSACTION); + + afr_accused_fill(this, xdata, metadata_accused, + AFR_METADATA_TRANSACTION); + } + + if (replies && ia_type != IA_INVAL && ia_type != IA_IFDIR && + /* We want to accuse small files only when we know for + * sure that there is no IO happening. Otherwise, the + * ia_sizes obtained in post-refresh replies may + * mismatch due to a race between inode-refresh and + * ongoing writes, causing spurious heal launches*/ + !afr_is_possibly_under_txn(AFR_DATA_TRANSACTION, local, this)) { + afr_accuse_smallfiles(this, replies, data_accused); + } + + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) { + data_readable[i] = 0; + ret = 1; + } + if (metadata_accused[i]) { + metadata_readable[i] = 0; + ret = 1; + } + } + return ret; } +int +afr_replies_interpret(call_frame_t *frame, xlator_t *this, inode_t *inode, + gf_boolean_t *start_heal) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int event_generation = 0; + int i = 0; + unsigned char *data_accused = NULL; + unsigned char *metadata_accused = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + int ret = 0; + + local = frame->local; + priv = this->private; + replies = local->replies; + event_generation = local->event_generation; + + data_accused = alloca0(priv->child_count); + data_readable = alloca0(priv->child_count); + metadata_accused = alloca0(priv->child_count); + metadata_readable = alloca0(priv->child_count); + + ret = afr_readables_fill(frame, this, inode, data_accused, metadata_accused, + data_readable, metadata_readable, replies); + + for (i = 0; i < priv->child_count; i++) { + if (start_heal && priv->child_up[i] && + (data_accused[i] || metadata_accused[i])) { + *start_heal = _gf_true; + break; + } + } + afr_inode_read_subvol_set(inode, this, data_readable, metadata_readable, + event_generation); + return ret; +} -void -afr_local_cleanup (afr_local_t *local, xlator_t *this) +int +afr_refresh_selfheal_done(int ret, call_frame_t *heal, void *opaque) { - afr_private_t * priv = NULL; + if (heal) + AFR_STACK_DESTROY(heal); + return 0; +} - if (!local) - return; +int +afr_inode_refresh_err(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int err = 0; - afr_local_sh_cleanup (local, this); + local = frame->local; + priv = this->private; - afr_local_transaction_cleanup (local, this); + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && !local->replies[i].op_ret) { + err = 0; + goto ret; + } + } - priv = this->private; + err = afr_final_errno(local, priv); +ret: + return err; +} - loc_wipe (&local->loc); - loc_wipe (&local->newloc); +gf_boolean_t +afr_selfheal_enabled(const xlator_t *this) +{ + const afr_private_t *priv = this->private; - if (local->fd) - fd_unref (local->fd); + return priv->data_self_heal || priv->metadata_self_heal || + priv->entry_self_heal; +} - if (local->xattr_req) - dict_unref (local->xattr_req); +int +afr_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err) +{ + call_frame_t *heal_frame = NULL; + afr_local_t *heal_local = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + inode_t *inode = NULL; + int event_generation = 0; + int read_subvol = -1; + int ret = 0; + + local = frame->local; + inode = local->inode; + priv = this->private; + + if (err) + goto refresh_done; + + if (local->op == GF_FOP_LOOKUP) + goto refresh_done; + + ret = afr_inode_get_readable(frame, inode, this, local->readable, + &event_generation, local->transaction.type); + + if (ret == -EIO) { + /* No readable subvolume even after refresh ==> splitbrain.*/ + if (!priv->fav_child_policy) { + err = EIO; + goto refresh_done; + } + read_subvol = afr_sh_get_fav_by_policy(this, local->replies, inode, + NULL); + if (read_subvol == -1) { + err = EIO; + goto refresh_done; + } + + heal_frame = afr_frame_create(this, NULL); + if (!heal_frame) { + err = EIO; + goto refresh_done; + } + heal_local = heal_frame->local; + heal_local->xdata_req = dict_new(); + if (!heal_local->xdata_req) { + err = EIO; + AFR_STACK_DESTROY(heal_frame); + goto refresh_done; + } + heal_local->heal_frame = frame; + ret = synctask_new(this->ctx->env, afr_fav_child_reset_sink_xattrs, + afr_fav_child_reset_sink_xattrs_cbk, heal_frame, + heal_frame); + return 0; + } - if (local->dict) - dict_unref (local->dict); +refresh_done: + afr_local_replies_wipe(local, this->private); + local->refreshfn(frame, this, err); - GF_FREE(local->replies); + return 0; +} - GF_FREE (local->child_up); +int +afr_inode_refresh_done(call_frame_t *frame, xlator_t *this, int error) +{ + call_frame_t *heal_frame = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + gf_boolean_t start_heal = _gf_false; + afr_local_t *heal_local = NULL; + unsigned char *success_replies = NULL; + int ret = 0; + + if (error != 0) { + goto refresh_done; + } + + local = frame->local; + priv = this->private; + success_replies = alloca0(priv->child_count); + afr_fill_success_replies(local, priv, success_replies); + + if (priv->thin_arbiter_count && local->is_read_txn && + AFR_COUNT(success_replies, priv->child_count) != priv->child_count) { + /* We need to query the good bricks and/or thin-arbiter.*/ + if (success_replies[0]) { + local->read_txn_query_child = AFR_CHILD_ZERO; + } else if (success_replies[1]) { + local->read_txn_query_child = AFR_CHILD_ONE; + } + error = EINVAL; + goto refresh_done; + } + + if (!afr_has_quorum(success_replies, this, frame)) { + error = afr_final_errno(frame->local, this->private); + if (!error) + error = afr_quorum_errno(priv); + goto refresh_done; + } + + ret = afr_replies_interpret(frame, this, local->refreshinode, &start_heal); + + if (ret && afr_selfheal_enabled(this) && start_heal) { + heal_frame = afr_frame_create(this, NULL); + if (!heal_frame) + goto refresh_done; + heal_local = heal_frame->local; + heal_local->refreshinode = inode_ref(local->refreshinode); + heal_local->heal_frame = heal_frame; + if (!afr_throttled_selfheal(heal_frame, this)) { + AFR_STACK_DESTROY(heal_frame); + goto refresh_done; + } + } + +refresh_done: + afr_txn_refresh_done(frame, this, error); + + return 0; +} - GF_FREE (local->child_errno); +void +afr_inode_refresh_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *buf, + dict_t *xdata, struct iatt *par) +{ + afr_local_t *local = NULL; + int call_child = (long)cookie; + int8_t need_heal = 1; + int call_count = 0; + int ret = 0; + + local = frame->local; + local->replies[call_child].valid = 1; + local->replies[call_child].op_ret = op_ret; + local->replies[call_child].op_errno = op_errno; + if (op_ret != -1) { + local->replies[call_child].poststat = *buf; + if (par) + local->replies[call_child].postparent = *par; + if (xdata) + local->replies[call_child].xdata = dict_ref(xdata); + } + + if (xdata) { + ret = dict_get_int8(xdata, "link-count", &need_heal); + if (ret) { + gf_msg_debug(this->name, -ret, "Unable to get link count"); + } + } - GF_FREE (local->fresh_children); + local->replies[call_child].need_heal = need_heal; + call_count = afr_frame_return(frame); + if (call_count == 0) { + afr_set_need_heal(this, local); + ret = afr_inode_refresh_err(frame, this); + if (ret) { + gf_msg_debug(this->name, ret, "afr_inode_refresh_err failed"); + } + afr_inode_refresh_done(frame, this, ret); + } +} - { /* lookup */ - if (local->cont.lookup.xattrs) { - afr_reset_xattr (local->cont.lookup.xattrs, - priv->child_count); - GF_FREE (local->cont.lookup.xattrs); - local->cont.lookup.xattrs = NULL; - } +int +afr_inode_refresh_subvol_with_lookup_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, + int op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, + struct iatt *par) +{ + afr_inode_refresh_subvol_cbk(frame, cookie, this, op_ret, op_errno, buf, + xdata, par); + return 0; +} - if (local->cont.lookup.xattr) { - dict_unref (local->cont.lookup.xattr); - } +int +afr_inode_refresh_subvol_with_lookup(call_frame_t *frame, xlator_t *this, int i, + inode_t *inode, uuid_t gfid, dict_t *xdata) +{ + loc_t loc = { + 0, + }; + afr_private_t *priv = NULL; - if (local->cont.lookup.inode) { - inode_unref (local->cont.lookup.inode); - } + priv = this->private; - GF_FREE (local->cont.lookup.postparents); + loc.inode = inode; + if (gf_uuid_is_null(inode->gfid) && gfid) { + /* To handle setattr/setxattr on yet to be linked inode from + * dht */ + gf_uuid_copy(loc.gfid, gfid); + } else { + gf_uuid_copy(loc.gfid, inode->gfid); + } - GF_FREE (local->cont.lookup.bufs); + STACK_WIND_COOKIE(frame, afr_inode_refresh_subvol_with_lookup_cbk, + (void *)(long)i, priv->children[i], + priv->children[i]->fops->lookup, &loc, xdata); + return 0; +} - GF_FREE (local->cont.lookup.success_children); +int +afr_inode_refresh_subvol_with_fstat_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, + dict_t *xdata) +{ + afr_inode_refresh_subvol_cbk(frame, cookie, this, op_ret, op_errno, buf, + xdata, NULL); + return 0; +} - GF_FREE (local->cont.lookup.sources); - afr_matrix_cleanup (local->cont.lookup.pending_matrix, - priv->child_count); - } +int +afr_inode_refresh_subvol_with_fstat(call_frame_t *frame, xlator_t *this, int i, + dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; - { /* getxattr */ - GF_FREE (local->cont.getxattr.name); - } + priv = this->private; + local = frame->local; - { /* lk */ - GF_FREE (local->cont.lk.locked_nodes); - } + STACK_WIND_COOKIE(frame, afr_inode_refresh_subvol_with_fstat_cbk, + (void *)(long)i, priv->children[i], + priv->children[i]->fops->fstat, local->fd, xdata); + return 0; +} - { /* create */ - if (local->cont.create.fd) - fd_unref (local->cont.create.fd); - if (local->cont.create.params) - dict_unref (local->cont.create.params); - } +int +afr_inode_refresh_do(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + int ret = 0; + dict_t *xdata = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + unsigned char *wind_subvols = NULL; - { /* mknod */ - if (local->cont.mknod.params) - dict_unref (local->cont.mknod.params); - } + priv = this->private; + local = frame->local; + wind_subvols = alloca0(priv->child_count); - { /* mkdir */ - if (local->cont.mkdir.params) - dict_unref (local->cont.mkdir.params); - } + afr_local_replies_wipe(local, priv); - { /* symlink */ - if (local->cont.symlink.params) - dict_unref (local->cont.symlink.params); + if (local->fd) { + fd_ctx = afr_fd_ctx_get(local->fd, this); + if (!fd_ctx) { + afr_inode_refresh_done(frame, this, EINVAL); + return 0; } + } - { /* writev */ - GF_FREE (local->cont.writev.vector); - } + xdata = dict_new(); + if (!xdata) { + afr_inode_refresh_done(frame, this, ENOMEM); + return 0; + } - { /* setxattr */ - if (local->cont.setxattr.dict) - dict_unref (local->cont.setxattr.dict); - } + ret = afr_xattr_req_prepare(this, xdata); + if (ret != 0) { + dict_unref(xdata); + afr_inode_refresh_done(frame, this, -ret); + return 0; + } - { /* fsetxattr */ - if (local->cont.fsetxattr.dict) - dict_unref (local->cont.fsetxattr.dict); - } + ret = dict_set_sizen_str_sizen(xdata, "link-count", GF_XATTROP_INDEX_COUNT); + if (ret) { + gf_msg_debug(this->name, -ret, "Unable to set link-count in dict "); + } - { /* removexattr */ - GF_FREE (local->cont.removexattr.name); - } - { /* xattrop */ - if (local->cont.xattrop.xattr) - dict_unref (local->cont.xattrop.xattr); - } - { /* fxattrop */ - if (local->cont.fxattrop.xattr) - dict_unref (local->cont.fxattrop.xattr); - } - { /* symlink */ - GF_FREE (local->cont.symlink.linkpath); - } + ret = dict_set_str_sizen(xdata, GLUSTERFS_INODELK_DOM_COUNT, this->name); + if (ret) { + gf_msg_debug(this->name, -ret, + "Unable to set inodelk-dom-count in dict "); + } - { /* opendir */ - GF_FREE (local->cont.opendir.checksum); + if (local->fd) { + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] && fd_ctx->opened_on[i] == AFR_FD_OPENED) + wind_subvols[i] = 1; } + } else { + memcpy(wind_subvols, local->child_up, + sizeof(*local->child_up) * priv->child_count); + } - { /* readdirp */ - if (local->cont.readdir.dict) - dict_unref (local->cont.readdir.dict); - } + local->call_count = AFR_COUNT(wind_subvols, priv->child_count); - if (local->xdata_req) - dict_unref (local->xdata_req); + call_count = local->call_count; + if (!call_count) { + dict_unref(xdata); + if (local->fd && AFR_COUNT(local->child_up, priv->child_count)) + afr_inode_refresh_done(frame, this, EBADFD); + else + afr_inode_refresh_done(frame, this, ENOTCONN); + return 0; + } + for (i = 0; i < priv->child_count; i++) { + if (!wind_subvols[i]) + continue; - if (local->xdata_rsp) - dict_unref (local->xdata_rsp); -} + if (local->fd) + afr_inode_refresh_subvol_with_fstat(frame, this, i, xdata); + else + afr_inode_refresh_subvol_with_lookup( + frame, this, i, local->refreshinode, local->refreshgfid, xdata); + + if (!--call_count) + break; + } + dict_unref(xdata); + + return 0; +} int -afr_frame_return (call_frame_t *frame) +afr_inode_refresh(call_frame_t *frame, xlator_t *this, inode_t *inode, + uuid_t gfid, afr_inode_refresh_cbk_t refreshfn) { - afr_local_t *local = NULL; - int call_count = 0; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - LOCK (&frame->lock); - { - call_count = --local->call_count; - } - UNLOCK (&frame->lock); + local->refreshfn = refreshfn; + + if (local->refreshinode) { + inode_unref(local->refreshinode); + local->refreshinode = NULL; + } - return call_count; + local->refreshinode = inode_ref(inode); + + if (gfid) + gf_uuid_copy(local->refreshgfid, gfid); + else + gf_uuid_clear(local->refreshgfid); + + afr_inode_refresh_do(frame, this); + + return 0; } int -afr_set_elem_count_get (unsigned char *elems, int child_count) +afr_xattr_req_prepare(xlator_t *this, dict_t *xattr_req) { - int i = 0; - int ret = 0; - - for (i = 0; i < child_count; i++) - if (elems[i]) - ret++; - return ret; -} + int i = 0; + afr_private_t *priv = NULL; + int ret = 0; -/** - * up_children_count - return the number of children that are up - */ + priv = this->private; -unsigned int -afr_up_children_count (unsigned char *child_up, unsigned int child_count) -{ - return afr_set_elem_count_get (child_up, child_count); + for (i = 0; i < priv->child_count; i++) { + ret = dict_set_uint64(xattr_req, priv->pending_key[i], + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret < 0) + gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED, + "Unable to set dict value for %s", priv->pending_key[i]); + /* 3 = data+metadata+entry */ + } + ret = dict_set_uint64(xattr_req, AFR_DIRTY, + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret) { + gf_msg_debug(this->name, -ret, + "failed to set dirty " + "query flag"); + } + + ret = dict_set_int32_sizen(xattr_req, "list-xattr", 1); + if (ret) { + gf_msg_debug(this->name, -ret, "Unable to set list-xattr in dict "); + } + + return ret; } -unsigned int -afr_locked_children_count (unsigned char *children, unsigned int child_count) -{ - return afr_set_elem_count_get (children, child_count); +int +afr_lookup_xattr_req_prepare(afr_local_t *local, xlator_t *this, + dict_t *xattr_req, loc_t *loc) +{ + int ret = -ENOMEM; + + if (!local->xattr_req) + local->xattr_req = dict_new(); + + if (!local->xattr_req) + goto out; + + if (xattr_req && (xattr_req != local->xattr_req)) + dict_copy(xattr_req, local->xattr_req); + + ret = afr_xattr_req_prepare(this, local->xattr_req); + + ret = dict_set_uint64(local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED, + "%s: Unable to set dict value for %s", loc->path, + GLUSTERFS_INODELK_COUNT); + } + ret = dict_set_uint64(local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED, + "%s: Unable to set dict value for %s", loc->path, + GLUSTERFS_ENTRYLK_COUNT); + } + + ret = dict_set_uint32(local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED, + "%s: Unable to set dict value for %s", loc->path, + GLUSTERFS_PARENT_ENTRYLK); + } + + ret = dict_set_sizen_str_sizen(local->xattr_req, "link-count", + GF_XATTROP_INDEX_COUNT); + if (ret) { + gf_msg_debug(this->name, -ret, "Unable to set link-count in dict "); + } + + ret = 0; +out: + return ret; } -unsigned int -afr_pre_op_done_children_count (unsigned char *pre_op, - unsigned int child_count) +int +afr_least_pending_reads_child(afr_private_t *priv, unsigned char *readable) { - return afr_set_elem_count_get (pre_op, child_count); + int i = 0; + int child = -1; + int64_t read_iter = -1; + int64_t pending_read = -1; + + for (i = 0; i < priv->child_count; i++) { + if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i]) + continue; + read_iter = GF_ATOMIC_GET(priv->pending_reads[i]); + if (child == -1 || read_iter < pending_read) { + pending_read = read_iter; + child = i; + } + } + + return child; } -gf_boolean_t -afr_is_fresh_lookup (loc_t *loc, xlator_t *this) +static int32_t +afr_least_latency_child(afr_private_t *priv, unsigned char *readable) { - uint64_t ctx = 0; - int32_t ret = 0; + int32_t i = 0; + int child = -1; - GF_ASSERT (loc); - GF_ASSERT (this); - GF_ASSERT (loc->inode); + for (i = 0; i < priv->child_count; i++) { + if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] || + priv->child_latency[i] < 0) + continue; - ret = inode_ctx_get (loc->inode, this, &ctx); - if (0 == ret) - return _gf_false; - return _gf_true; + if (child == -1 || + priv->child_latency[i] < priv->child_latency[child]) { + child = i; + } + } + return child; } -void -afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent) +static int32_t +afr_least_latency_times_pending_reads_child(afr_private_t *priv, + unsigned char *readable) { - GF_ASSERT (loc); - GF_ASSERT (buf); + int32_t i = 0; + int child = -1; + int64_t pending_read = 0; + int64_t latency = -1; + int64_t least_latency = -1; - uuid_copy (loc->gfid, buf->ia_gfid); - if (postparent) - uuid_copy (loc->pargfid, postparent->ia_gfid); -} + for (i = 0; i < priv->child_count; i++) { + if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] || + priv->child_latency[i] < 0) + continue; -int -afr_lookup_build_response_params (afr_local_t *local, xlator_t *this) -{ - struct iatt *buf = NULL; - struct iatt *postparent = NULL; - dict_t **xattr = NULL; - int32_t *success_children = NULL; - int32_t *sources = NULL; - afr_private_t *priv = NULL; - int32_t read_child = -1; - int ret = 0; - int i = 0; - - GF_ASSERT (local); - - buf = &local->cont.lookup.buf; - postparent = &local->cont.lookup.postparent; - xattr = &local->cont.lookup.xattr; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->cont.lookup.inode, - local->fresh_children); - if (read_child < 0) { - ret = -1; - goto out; - } - success_children = local->cont.lookup.success_children; - sources = local->cont.lookup.sources; - memset (sources, 0, sizeof (*sources) * priv->child_count); - afr_children_intersection_get (local->fresh_children, success_children, - sources, priv->child_count); - if (!sources[read_child]) { - read_child = -1; - for (i = 0; i < priv->child_count; i++) { - if (sources[i]) { - read_child = i; - break; - } - } - } - if (read_child < 0) { - ret = -1; - goto out; - } - gf_log (this->name, GF_LOG_DEBUG, "Building lookup response from %d", - read_child); - if (!*xattr) - *xattr = dict_ref (local->cont.lookup.xattrs[read_child]); - *buf = local->cont.lookup.bufs[read_child]; - *postparent = local->cont.lookup.postparents[read_child]; + pending_read = GF_ATOMIC_GET(priv->pending_reads[i]); + latency = (pending_read + 1) * priv->child_latency[i]; - if (IA_INVAL == local->cont.lookup.inode->ia_type) { - /* fix for RT #602 */ - local->cont.lookup.inode->ia_type = buf->ia_type; + if (child == -1 || latency < least_latency) { + least_latency = latency; + child = i; } -out: - return ret; + } + return child; } -static void -afr_lookup_update_lk_counts (afr_local_t *local, xlator_t *this, - int child_index, dict_t *xattr) -{ - uint32_t inodelk_count = 0; - uint32_t entrylk_count = 0; - int ret = -1; - uint32_t parent_entrylk = 0; - - GF_ASSERT (local); - GF_ASSERT (this); - GF_ASSERT (xattr); - GF_ASSERT (child_index >= 0); - - ret = dict_get_uint32 (xattr, GLUSTERFS_INODELK_COUNT, - &inodelk_count); - if (ret == 0) - local->inodelk_count += inodelk_count; - - ret = dict_get_uint32 (xattr, GLUSTERFS_ENTRYLK_COUNT, - &entrylk_count); - if (ret == 0) - local->entrylk_count += entrylk_count; - ret = dict_get_uint32 (xattr, GLUSTERFS_PARENT_ENTRYLK, - &parent_entrylk); - if (!ret) - local->cont.lookup.parent_entrylk += parent_entrylk; +int +afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv, + unsigned char *readable) +{ + uuid_t gfid_copy = { + 0, + }; + pid_t pid; + int child = -1; + + switch (priv->hash_mode) { + case AFR_READ_POLICY_FIRST_UP: + break; + case AFR_READ_POLICY_GFID_HASH: + gf_uuid_copy(gfid_copy, args->gfid); + child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) % + priv->child_count; + break; + case AFR_READ_POLICY_GFID_PID_HASH: + if (args->ia_type != IA_IFDIR) { + /* + * Why getpid? Because it's one of the cheapest calls + * available - faster than gethostname etc. - and + * returns a constant-length value that's sure to be + * shorter than a UUID. It's still very unlikely to be + * the same across clients, so it still provides good + * mixing. We're not trying for perfection here. All we + * need is a low probability that multiple clients + * won't converge on the same subvolume. + */ + gf_uuid_copy(gfid_copy, args->gfid); + pid = getpid(); + *(pid_t *)gfid_copy ^= pid; + } + child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) % + priv->child_count; + break; + case AFR_READ_POLICY_LESS_LOAD: + child = afr_least_pending_reads_child(priv, readable); + break; + case AFR_READ_POLICY_LEAST_LATENCY: + child = afr_least_latency_child(priv, readable); + break; + case AFR_READ_POLICY_LOAD_LATENCY_HYBRID: + child = afr_least_latency_times_pending_reads_child(priv, readable); + break; + } + + return child; } -/* - * It's important to maintain a commutative property on do_*_self_heal and - * found*; once set, they must not be cleared by a subsequent iteration or - * call, so that they represent a logical OR of all iterations and calls - * regardless of child/key order. That allows the caller to call us multiple - * times without having to use a separate variable as a "reduce" accumulator. - */ -static void -afr_lookup_set_self_heal_params_by_xattr (afr_local_t *local, xlator_t *this, - dict_t *xattr) +int +afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this, + unsigned char *readable, + afr_read_subvol_args_t *args) { - afr_private_t *priv = NULL; - int i = 0; - int ret = -1; - void *pending_raw = NULL; - int32_t *pending = NULL; + int i = 0; + int read_subvol = -1; + afr_private_t *priv = NULL; + afr_read_subvol_args_t local_args = { + 0, + }; - GF_ASSERT (local); - GF_ASSERT (this); - GF_ASSERT (xattr); + priv = this->private; - priv = this->private; + /* first preference - explicitly specified or local subvolume */ + if (priv->read_child >= 0 && readable[priv->read_child]) + return priv->read_child; - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); - if (ret != 0) { - continue; - } - pending = pending_raw; + if (inode_is_linked(inode)) { + gf_uuid_copy(local_args.gfid, inode->gfid); + local_args.ia_type = inode->ia_type; + } else if (args) { + local_args = *args; + } - if (pending[AFR_METADATA_TRANSACTION]) { - gf_log(this->name, GF_LOG_DEBUG, - "metadata self-heal is pending for %s.", - local->loc.path); - local->self_heal.do_metadata_self_heal = _gf_true; - } + /* second preference - use hashed mode */ + read_subvol = afr_hash_child(&local_args, priv, readable); + if (read_subvol >= 0 && readable[read_subvol]) + return read_subvol; - if (pending[AFR_ENTRY_TRANSACTION]) { - gf_log(this->name, GF_LOG_DEBUG, - "entry self-heal is pending for %s.", - local->loc.path); - local->self_heal.do_entry_self_heal = _gf_true; - } + for (i = 0; i < priv->child_count; i++) { + if (readable[i]) + return i; + } - if (pending[AFR_DATA_TRANSACTION]) { - gf_log(this->name, GF_LOG_DEBUG, - "data self-heal is pending for %s.", - local->loc.path); - local->self_heal.do_data_self_heal = _gf_true; - } - } + /* no readable subvolumes, either split brain or all subvols down */ + + return -1; } -void -afr_lookup_check_set_metadata_split_brain (afr_local_t *local, xlator_t *this) +int +afr_inode_read_subvol_type_get(inode_t *inode, xlator_t *this, + unsigned char *readable, int *event_p, int type) { - int32_t *sources = NULL; - afr_private_t *priv = NULL; - int32_t subvol_status = 0; - int32_t *success_children = NULL; - dict_t **xattrs = NULL; - struct iatt *bufs = NULL; - int32_t **pending_matrix = NULL; + int ret = -1; - priv = this->private; - - sources = GF_CALLOC (priv->child_count, sizeof (*sources), - gf_afr_mt_int32_t); - if (NULL == sources) - goto out; - success_children = local->cont.lookup.success_children; - xattrs = local->cont.lookup.xattrs; - bufs = local->cont.lookup.bufs; - pending_matrix = local->cont.lookup.pending_matrix; - afr_build_sources (this, xattrs, bufs, pending_matrix, - sources, success_children, AFR_METADATA_TRANSACTION, - &subvol_status, _gf_false); - if (subvol_status & SPLIT_BRAIN) - local->cont.lookup.possible_spb = _gf_true; -out: - GF_FREE (sources); + if (type == AFR_METADATA_TRANSACTION) + ret = afr_inode_read_subvol_get(inode, this, 0, readable, event_p); + else + ret = afr_inode_read_subvol_get(inode, this, readable, 0, event_p); + return ret; } -static void -afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this, - struct iatt *buf, struct iatt *lookup_buf) +void +afr_readables_intersect_get(inode_t *inode, xlator_t *this, int *event, + unsigned char *intersection) { - if (PERMISSION_DIFFERS (buf, lookup_buf)) { - /* mismatching permissions */ - gf_log (this->name, GF_LOG_DEBUG, - "permissions differ for %s ", local->loc.path); - local->self_heal.do_metadata_self_heal = _gf_true; - } + afr_private_t *priv = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + unsigned char *intersect = NULL; - if (OWNERSHIP_DIFFERS (buf, lookup_buf)) { - /* mismatching permissions */ - local->self_heal.do_metadata_self_heal = _gf_true; - gf_log (this->name, GF_LOG_DEBUG, - "ownership differs for %s ", local->loc.path); - } + priv = this->private; + data_readable = alloca0(priv->child_count); + metadata_readable = alloca0(priv->child_count); + intersect = alloca0(priv->child_count); - if (SIZE_DIFFERS (buf, lookup_buf) - && IA_ISREG (buf->ia_type)) { - gf_log (this->name, GF_LOG_DEBUG, - "size differs for %s ", local->loc.path); - local->self_heal.do_data_self_heal = _gf_true; - } + afr_inode_read_subvol_get(inode, this, data_readable, metadata_readable, + event); - if (uuid_compare (buf->ia_gfid, lookup_buf->ia_gfid)) { - /* mismatching gfid */ - gf_log (this->name, GF_LOG_DEBUG, - "%s: gfid different on subvolume", local->loc.path); - } + AFR_INTERSECT(intersect, data_readable, metadata_readable, + priv->child_count); + if (intersection) + memcpy(intersection, intersect, + sizeof(*intersection) * priv->child_count); } -static void -afr_detect_self_heal_by_split_brain_status (afr_local_t *local, xlator_t *this) +int +afr_read_subvol_get(inode_t *inode, xlator_t *this, int *subvol_p, + unsigned char *readables, int *event_p, + afr_transaction_type type, afr_read_subvol_args_t *args) { - gf_boolean_t split_brain = _gf_false; - afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + unsigned char *readable = NULL; + unsigned char *intersection = NULL; + int subvol = -1; + int event = 0; - sh = &local->self_heal; + priv = this->private; - split_brain = afr_is_split_brain (this, local->cont.lookup.inode); - split_brain = split_brain || local->cont.lookup.possible_spb; - if ((local->success_count > 0) && split_brain && - IA_ISREG (local->cont.lookup.inode->ia_type)) { - sh->force_confirm_spb = _gf_true; - gf_log (this->name, GF_LOG_DEBUG, - "split brain detected during lookup of %s.", - local->loc.path); - } -} + readable = alloca0(priv->child_count); + intersection = alloca0(priv->child_count); -static void -afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this) -{ - GF_ASSERT (local); - GF_ASSERT (this); + afr_inode_read_subvol_type_get(inode, this, readable, &event, type); - if ((local->success_count > 0) && (local->enoent_count > 0)) { - local->self_heal.do_metadata_self_heal = _gf_true; - local->self_heal.do_data_self_heal = _gf_true; - local->self_heal.do_entry_self_heal = _gf_true; - local->self_heal.do_gfid_self_heal = _gf_true; - local->self_heal.do_missing_entry_self_heal = _gf_true; - gf_log(this->name, GF_LOG_DEBUG, - "entries are missing in lookup of %s.", - local->loc.path); - } + afr_readables_intersect_get(inode, this, &event, intersection); - return; + if (AFR_COUNT(intersection, priv->child_count) > 0) + subvol = afr_read_subvol_select_by_policy(inode, this, intersection, + args); + else + subvol = afr_read_subvol_select_by_policy(inode, this, readable, args); + if (subvol_p) + *subvol_p = subvol; + if (event_p) + *event_p = event; + if (readables) + memcpy(readables, readable, sizeof(*readables) * priv->child_count); + return subvol; } -gf_boolean_t -afr_can_self_heal_proceed (afr_self_heal_t *sh, afr_private_t *priv) +void +afr_local_transaction_cleanup(afr_local_t *local, xlator_t *this) { - GF_ASSERT (sh); - GF_ASSERT (priv); + afr_private_t *priv = NULL; + int i = 0; - if (sh->force_confirm_spb) - return _gf_true; - return (sh->do_gfid_self_heal - || sh->do_missing_entry_self_heal - || (afr_data_self_heal_enabled (priv->data_self_heal) && - sh->do_data_self_heal) - || (priv->metadata_self_heal && sh->do_metadata_self_heal) - || (priv->entry_self_heal && sh->do_entry_self_heal)); -} + priv = this->private; -afr_transaction_type -afr_transaction_type_get (ia_type_t ia_type) -{ - afr_transaction_type type = AFR_METADATA_TRANSACTION; + afr_matrix_cleanup(local->pending, priv->child_count); - GF_ASSERT (ia_type != IA_INVAL); + GF_FREE(local->internal_lock.lower_locked_nodes); - if (IA_ISDIR (ia_type)) { - type = AFR_ENTRY_TRANSACTION; - } else if (IA_ISREG (ia_type)) { - type = AFR_DATA_TRANSACTION; - } - return type; -} + afr_lockees_cleanup(&local->internal_lock); -int -afr_lookup_select_read_child (afr_local_t *local, xlator_t *this, - int32_t *read_child) -{ - ia_type_t ia_type = IA_INVAL; - int32_t source = -1; - int ret = -1; - dict_t **xattrs = NULL; - int32_t *success_children = NULL; - afr_transaction_type type = AFR_METADATA_TRANSACTION; - uuid_t *gfid = NULL; - - GF_ASSERT (local); - GF_ASSERT (this); - GF_ASSERT (local->success_count > 0); - - success_children = local->cont.lookup.success_children; - /*We can take the success_children[0] only because we already - *handle the conflicting children other wise, we could select the - *read_child based on wrong file type - */ - ia_type = local->cont.lookup.bufs[success_children[0]].ia_type; - type = afr_transaction_type_get (ia_type); - xattrs = local->cont.lookup.xattrs; - gfid = &local->cont.lookup.buf.ia_gfid; - source = afr_lookup_select_read_child_by_txn_type (this, local, xattrs, - type, *gfid); - if (source < 0) { - gf_log (this->name, GF_LOG_DEBUG, "failed to select source " - "for %s", local->loc.path); - goto out; + GF_FREE(local->transaction.pre_op); + + GF_FREE(local->transaction.pre_op_sources); + if (local->transaction.changelog_xdata) { + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.changelog_xdata[i]) + continue; + dict_unref(local->transaction.changelog_xdata[i]); } + GF_FREE(local->transaction.changelog_xdata); + } - gf_log (this->name, GF_LOG_DEBUG, "Source selected as %d for %s", - source, local->loc.path); - *read_child = source; - ret = 0; -out: - return ret; -} + GF_FREE(local->transaction.failed_subvols); -static inline gf_boolean_t -afr_is_transaction_running (afr_local_t *local) -{ - GF_ASSERT (local->fop == GF_FOP_LOOKUP); - return ((local->inodelk_count > 0) || (local->entrylk_count > 0)); + GF_FREE(local->transaction.basename); + GF_FREE(local->transaction.new_basename); + + loc_wipe(&local->transaction.parent_loc); + loc_wipe(&local->transaction.new_parent_loc); } void -afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, - gf_boolean_t background, ia_type_t ia_type, char *reason, - void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, - xlator_t *this), - int (*unwind) (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno, - int32_t sh_failed)) -{ - afr_local_t *local = NULL; - char sh_type_str[256] = {0,}; - char *bg = ""; - - GF_ASSERT (frame); - GF_ASSERT (this); - GF_ASSERT (inode); - GF_ASSERT (ia_type != IA_INVAL); - - local = frame->local; - local->self_heal.background = background; - local->self_heal.type = ia_type; - local->self_heal.unwind = unwind; - local->self_heal.gfid_sh_success_cbk = gfid_sh_success_cbk; - - afr_self_heal_type_str_get (&local->self_heal, - sh_type_str, - sizeof (sh_type_str)); - - if (background) - bg = "background"; - gf_log (this->name, GF_LOG_DEBUG, - "%s %s self-heal triggered. path: %s, reason: %s", bg, - sh_type_str, local->loc.path, reason); - - afr_self_heal (frame, this, inode); -} - -unsigned int -afr_gfid_missing_count (const char *xlator_name, int32_t *success_children, - struct iatt *bufs, unsigned int child_count, - const char *path) -{ - unsigned int gfid_miss_count = 0; - int i = 0; - struct iatt *child1 = NULL; - - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - child1 = &bufs[success_children[i]]; - if (uuid_is_null (child1->ia_gfid)) { - gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid is null" - " on subvolume %d", path, success_children[i]); - gfid_miss_count++; - } - } +afr_reply_wipe(struct afr_reply *reply) +{ + if (reply->xdata) { + dict_unref(reply->xdata); + reply->xdata = NULL; + } - return gfid_miss_count; + if (reply->xattr) { + dict_unref(reply->xattr); + reply->xattr = NULL; + } } -static int -afr_lookup_gfid_missing_count (afr_local_t *local, xlator_t *this) +void +afr_replies_wipe(struct afr_reply *replies, int count) { - int32_t *success_children = NULL; - afr_private_t *priv = NULL; - struct iatt *bufs = NULL; - int miss_count = 0; - - priv = this->private; - bufs = local->cont.lookup.bufs; - success_children = local->cont.lookup.success_children; + int i = 0; - miss_count = afr_gfid_missing_count (this->name, success_children, - bufs, priv->child_count, - local->loc.path); - return miss_count; -} - -gf_boolean_t -afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, - unsigned int child_count, const char *path, - const char *xlator_name) -{ - gf_boolean_t conflicting = _gf_false; - int i = 0; - struct iatt *child1 = NULL; - struct iatt *child2 = NULL; - uuid_t *gfid = NULL; - - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - child1 = &bufs[success_children[i]]; - if ((!gfid) && (!uuid_is_null (child1->ia_gfid))) - gfid = &child1->ia_gfid; - - if (i == 0) - continue; - - child2 = &bufs[success_children[i-1]]; - if (FILETYPE_DIFFERS (child1, child2)) { - gf_log (xlator_name, GF_LOG_DEBUG, "%s: filetype " - "differs on subvolumes (%d, %d)", path, - success_children[i-1], success_children[i]); - conflicting = _gf_true; - goto out; - } - if (!gfid || uuid_is_null (child1->ia_gfid)) - continue; - if (uuid_compare (*gfid, child1->ia_gfid)) { - gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid differs" - " on subvolume %d", path, success_children[i]); - conflicting = _gf_true; - goto out; - } - } -out: - return conflicting; + for (i = 0; i < count; i++) { + afr_reply_wipe(&replies[i]); + } } -/* afr_update_gfid_from_iatts: This function should be called only if the - * iatts are not conflicting. - */ void -afr_update_gfid_from_iatts (uuid_t uuid, struct iatt *bufs, - int32_t *success_children, unsigned int child_count) -{ - uuid_t *gfid = NULL; - int i = 0; - int child = 0; - - for (i = 0; i < child_count; i++) { - child = success_children[i]; - if (child == -1) - break; - if ((!gfid) && (!uuid_is_null (bufs[child].ia_gfid))) { - gfid = &bufs[child].ia_gfid; - } else if (gfid && (!uuid_is_null (bufs[child].ia_gfid))) { - if (uuid_compare (*gfid, bufs[child].ia_gfid)) { - GF_ASSERT (0); - goto out; - } - } - } - if (gfid && (!uuid_is_null (*gfid))) - uuid_copy (uuid, *gfid); -out: +afr_local_replies_wipe(afr_local_t *local, afr_private_t *priv) +{ + if (!local->replies) return; -} -static gf_boolean_t -afr_lookup_conflicting_entries (afr_local_t *local, xlator_t *this) -{ - afr_private_t *priv = NULL; - gf_boolean_t conflict = _gf_false; + afr_replies_wipe(local->replies, priv->child_count); - priv = this->private; - conflict = afr_conflicting_iattrs (local->cont.lookup.bufs, - local->cont.lookup.success_children, - priv->child_count, local->loc.path, - this->name); - return conflict; + memset(local->replies, 0, sizeof(*local->replies) * priv->child_count); } -gf_boolean_t -afr_open_only_data_self_heal (char *data_self_heal) -{ - return !strcmp (data_self_heal, "open"); +static gf_boolean_t +afr_fop_lock_is_unlock(call_frame_t *frame) +{ + afr_local_t *local = frame->local; + switch (local->op) { + case GF_FOP_INODELK: + case GF_FOP_FINODELK: + if ((F_UNLCK == local->cont.inodelk.in_flock.l_type) && + (local->cont.inodelk.in_cmd == F_SETLKW || + local->cont.inodelk.in_cmd == F_SETLK)) + return _gf_true; + break; + case GF_FOP_ENTRYLK: + case GF_FOP_FENTRYLK: + if (ENTRYLK_UNLOCK == local->cont.entrylk.in_cmd) + return _gf_true; + break; + default: + return _gf_false; + } + return _gf_false; } -gf_boolean_t -afr_data_self_heal_enabled (char *data_self_heal) +static gf_boolean_t +afr_lk_is_unlock(int32_t cmd, struct gf_flock *flock) { - gf_boolean_t enabled = _gf_false; + switch (cmd) { + case F_RESLK_UNLCK: + return _gf_true; + break; - if (gf_string2boolean (data_self_heal, &enabled) == -1) { - enabled = !strcmp (data_self_heal, "open"); - GF_ASSERT (enabled); - } +#if F_SETLKW != F_SETLKW64 + case F_SETLKW64: +#endif + case F_SETLKW: - return enabled; +#if F_SETLK != F_SETLK64 + case F_SETLK64: +#endif + case F_SETLK: + if (F_UNLCK == flock->l_type) + return _gf_true; + break; + default: + return _gf_false; + } + return _gf_false; } -static void -afr_lookup_set_self_heal_params (afr_local_t *local, xlator_t *this) +void +afr_handle_inconsistent_fop(call_frame_t *frame, int32_t *op_ret, + int32_t *op_errno) { - int i = 0; - struct iatt *bufs = NULL; - dict_t **xattr = NULL; - afr_private_t *priv = NULL; - int32_t child1 = -1; - int32_t child2 = -1; - afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; - priv = this->private; - sh = &local->self_heal; + if (!frame || !frame->this || !frame->local || !frame->this->private) + return; - afr_detect_self_heal_by_lookup_status (local, this); + if (*op_ret < 0) + return; - if (afr_lookup_gfid_missing_count (local, this)) - local->self_heal.do_gfid_self_heal = _gf_true; + /* Failing inodelk/entrylk/lk here is not a good idea because we + * need to cleanup the locks on the other bricks if we choose to fail + * the fop here. The brick may go down just after unwind happens as well + * so anyways the fop will fail when the next fop is sent so leaving + * it like this for now.*/ + local = frame->local; + switch (local->op) { + case GF_FOP_LOOKUP: + case GF_FOP_INODELK: + case GF_FOP_FINODELK: + case GF_FOP_ENTRYLK: + case GF_FOP_FENTRYLK: + case GF_FOP_LK: + return; + default: + break; + } - if (_gf_true == afr_lookup_conflicting_entries (local, this)) - local->self_heal.do_missing_entry_self_heal = _gf_true; - else - afr_update_gfid_from_iatts (local->self_heal.sh_gfid_req, - local->cont.lookup.bufs, - local->cont.lookup.success_children, - priv->child_count); + priv = frame->this->private; + if (!priv->consistent_io) + return; - bufs = local->cont.lookup.bufs; - for (i = 1; i < local->success_count; i++) { - child1 = local->cont.lookup.success_children[i-1]; - child2 = local->cont.lookup.success_children[i]; - afr_detect_self_heal_by_iatt (local, this, - &bufs[child1], &bufs[child2]); - } + if (local->event_generation && + (local->event_generation != priv->event_generation)) + goto inconsistent; - xattr = local->cont.lookup.xattrs; - for (i = 0; i < local->success_count; i++) { - child1 = local->cont.lookup.success_children[i]; - afr_lookup_set_self_heal_params_by_xattr (local, this, - xattr[child1]); - } - if (afr_open_only_data_self_heal (priv->data_self_heal)) - sh->do_data_self_heal = _gf_false; - if (sh->do_metadata_self_heal) - afr_lookup_check_set_metadata_split_brain (local, this); - afr_detect_self_heal_by_split_brain_status (local, this); + return; +inconsistent: + *op_ret = -1; + *op_errno = ENOTCONN; } -int -afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno, - int32_t sh_failed) +void +afr_local_cleanup(afr_local_t *local, xlator_t *this) { - afr_local_t *local = NULL; - int ret = -1; - dict_t *xattr = NULL; + afr_private_t *priv = NULL; - local = frame->local; + if (!local) + return; - if (op_ret == -1) { - local->op_ret = -1; - local->op_errno = afr_most_important_error(local->op_errno, - op_errno, _gf_true); + syncbarrier_destroy(&local->barrier); - goto out; - } else { - local->op_ret = 0; - } - - afr_lookup_done_success_action (frame, this, _gf_true); - xattr = local->cont.lookup.xattr; - if (xattr) { - ret = dict_set_int32 (xattr, "sh-failed", sh_failed); - if (ret) - gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " - "sh-failed to %d", local->loc.path, sh_failed); - - if (local->self_heal.actual_sh_started == _gf_true && - sh_failed == 0) { - ret = dict_set_int32 (xattr, "actual-sh-done", 1); - if (ret) - gf_log(this->name, GF_LOG_ERROR, "%s: Failed to" - " set actual-sh-done to %d", - local->loc.path, - local->self_heal.actual_sh_started); - } - } -out: - AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, - local->cont.lookup.inode, &local->cont.lookup.buf, - local->cont.lookup.xattr, - &local->cont.lookup.postparent); + afr_local_transaction_cleanup(local, this); - return 0; -} + priv = this->private; -//TODO: At the moment only lookup needs this, so not doing any checks, in the -// future we will have to do fop specific operations -void -afr_post_gfid_sh_success (call_frame_t *sh_frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_local_t *sh_local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - int i = 0; - struct iatt *lookup_bufs = NULL; - struct iatt *lookup_parentbufs = NULL; + loc_wipe(&local->loc); + loc_wipe(&local->newloc); - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - local = sh->orig_frame->local; - lookup_bufs = local->cont.lookup.bufs; - lookup_parentbufs = local->cont.lookup.postparents; - priv = this->private; + if (local->fd) + fd_unref(local->fd); - memcpy (lookup_bufs, sh->buf, priv->child_count * sizeof (*sh->buf)); - memcpy (lookup_parentbufs, sh->parentbufs, - priv->child_count * sizeof (*sh->parentbufs)); + if (local->xattr_req) + dict_unref(local->xattr_req); - afr_reset_xattr (local->cont.lookup.xattrs, priv->child_count); - if (local->cont.lookup.xattr) { - dict_unref (local->cont.lookup.xattr); - local->cont.lookup.xattr = NULL; - } + if (local->xattr_rsp) + dict_unref(local->xattr_rsp); - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) - local->cont.lookup.xattrs[i] = dict_ref (sh->xattr[i]); - } + if (local->dict) + dict_unref(local->dict); - afr_reset_children (local->cont.lookup.success_children, - priv->child_count); - afr_children_copy (local->cont.lookup.success_children, - sh->fresh_children, priv->child_count); -} + afr_local_replies_wipe(local, priv); + GF_FREE(local->replies); -static void -afr_lookup_perform_self_heal (call_frame_t *frame, xlator_t *this, - gf_boolean_t *sh_launched) -{ - unsigned int up_count = 0; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - char *reason = NULL; - - GF_ASSERT (sh_launched); - *sh_launched = _gf_false; - priv = this->private; - local = frame->local; - - up_count = afr_up_children_count (local->child_up, priv->child_count); - if (up_count == 1) { - gf_log (this->name, GF_LOG_DEBUG, - "Only 1 child up - do not attempt to detect self heal"); - goto out; - } + GF_FREE(local->child_up); - afr_lookup_set_self_heal_params (local, this); - if (afr_can_self_heal_proceed (&local->self_heal, priv)) { - if (afr_is_transaction_running (local)) - goto out; + GF_FREE(local->read_attempted); + + GF_FREE(local->readable); + GF_FREE(local->readable2); + + if (local->inode) + inode_unref(local->inode); - reason = "lookup detected pending operations"; - afr_launch_self_heal (frame, this, local->cont.lookup.inode, - _gf_true, local->cont.lookup.buf.ia_type, - reason, afr_post_gfid_sh_success, - afr_self_heal_lookup_unwind); - *sh_launched = _gf_true; - } -out: - return; -} + if (local->parent) + inode_unref(local->parent); -void -afr_get_fresh_children (int32_t *success_children, int32_t *sources, - int32_t *fresh_children, unsigned int child_count) -{ - unsigned int i = 0; - unsigned int j = 0; - - GF_ASSERT (success_children); - GF_ASSERT (sources); - GF_ASSERT (fresh_children); - - afr_reset_children (fresh_children, child_count); - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - if (afr_is_read_child (success_children, sources, child_count, - success_children[i])) { - fresh_children[j] = success_children[i]; - j++; - } - } -} + if (local->parent2) + inode_unref(local->parent2); -static int -afr_lookup_set_read_ctx (afr_local_t *local, xlator_t *this, int32_t read_child) -{ - afr_private_t *priv = NULL; + if (local->refreshinode) + inode_unref(local->refreshinode); - GF_ASSERT (read_child >= 0); + { /* getxattr */ + GF_FREE(local->cont.getxattr.name); + } - priv = this->private; - afr_get_fresh_children (local->cont.lookup.success_children, - local->cont.lookup.sources, - local->fresh_children, priv->child_count); - afr_inode_set_read_ctx (this, local->cont.lookup.inode, read_child, - local->fresh_children); + { /* lk */ + GF_FREE(local->cont.lk.locked_nodes); + GF_FREE(local->cont.lk.dom_locked_nodes); + GF_FREE(local->cont.lk.dom_lock_op_ret); + GF_FREE(local->cont.lk.dom_lock_op_errno); + } - return 0; -} + { /* create */ + if (local->cont.create.fd) + fd_unref(local->cont.create.fd); + if (local->cont.create.params) + dict_unref(local->cont.create.params); + } -int -afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, - gf_boolean_t fail_conflict) -{ - int32_t read_child = -1; - int32_t ret = -1; - afr_local_t *local = NULL; - gf_boolean_t fresh_lookup = _gf_false; + { /* mknod */ + if (local->cont.mknod.params) + dict_unref(local->cont.mknod.params); + } - local = frame->local; - fresh_lookup = local->cont.lookup.fresh_lookup; + { /* mkdir */ + if (local->cont.mkdir.params) + dict_unref(local->cont.mkdir.params); + } - if (local->loc.parent == NULL) - fail_conflict = _gf_true; + { /* symlink */ + if (local->cont.symlink.params) + dict_unref(local->cont.symlink.params); + } - if (afr_lookup_conflicting_entries (local, this)) { - if (fail_conflict == _gf_false) - ret = 0; - goto out; - } + { /* writev */ + GF_FREE(local->cont.writev.vector); + if (local->cont.writev.iobref) + iobref_unref(local->cont.writev.iobref); + } - ret = afr_lookup_select_read_child (local, this, &read_child); - if (!afr_is_transaction_running (local) || fresh_lookup) { - if (read_child < 0) - goto out; + { /* setxattr */ + if (local->cont.setxattr.dict) + dict_unref(local->cont.setxattr.dict); + } - ret = afr_lookup_set_read_ctx (local, this, read_child); - if (ret) - goto out; - } + { /* fsetxattr */ + if (local->cont.fsetxattr.dict) + dict_unref(local->cont.fsetxattr.dict); + } - ret = afr_lookup_build_response_params (local, this); - if (ret) - goto out; - afr_update_loc_gfids (&local->loc, - &local->cont.lookup.buf, - &local->cont.lookup.postparent); + { /* removexattr */ + GF_FREE(local->cont.removexattr.name); + } + { /* xattrop */ + if (local->cont.xattrop.xattr) + dict_unref(local->cont.xattrop.xattr); + } + { /* symlink */ + GF_FREE(local->cont.symlink.linkpath); + } - ret = 0; -out: - if (ret) { - local->op_ret = -1; - local->op_errno = EIO; - } - return ret; + { /* opendir */ + GF_FREE(local->cont.opendir.checksum); + } + + { /* open */ + if (local->cont.open.fd) + fd_unref(local->cont.open.fd); + } + + { /* readdirp */ + if (local->cont.readdir.dict) + dict_unref(local->cont.readdir.dict); + } + + { /* inodelk */ + GF_FREE(local->cont.inodelk.volume); + if (local->cont.inodelk.xdata) + dict_unref(local->cont.inodelk.xdata); + } + + { /* entrylk */ + GF_FREE(local->cont.entrylk.volume); + GF_FREE(local->cont.entrylk.basename); + if (local->cont.entrylk.xdata) + dict_unref(local->cont.entrylk.xdata); + } + + if (local->xdata_req) + dict_unref(local->xdata_req); + + if (local->xdata_rsp) + dict_unref(local->xdata_rsp); } int -afr_lookup_get_latest_subvol (afr_local_t *local, xlator_t *this) -{ - afr_private_t *priv = NULL; - int32_t *success_children = NULL; - struct iatt *bufs = NULL; - int i = 0; - int child = 0; - int lsubvol = -1; - - priv = this->private; - success_children = local->cont.lookup.success_children; - bufs = local->cont.lookup.bufs; - for (i = 0; i < priv->child_count; i++) { - child = success_children[i]; - if (child == -1) - break; - if (uuid_is_null (bufs[child].ia_gfid)) - continue; - if (lsubvol < 0) { - lsubvol = child; - } else if (bufs[lsubvol].ia_ctime < bufs[child].ia_ctime) { - lsubvol = child; - } else if ((bufs[lsubvol].ia_ctime == bufs[child].ia_ctime) && - (bufs[lsubvol].ia_ctime_nsec < bufs[child].ia_ctime_nsec)) { - lsubvol = child; - } - } - return lsubvol; +afr_frame_return(call_frame_t *frame) +{ + afr_local_t *local = NULL; + int call_count = 0; + + local = frame->local; + + LOCK(&frame->lock); + { + call_count = --local->call_count; + } + UNLOCK(&frame->lock); + + return call_count; } -void -afr_lookup_mark_other_entries_stale (afr_local_t *local, xlator_t *this, - int subvol) -{ - afr_private_t *priv = NULL; - int32_t *success_children = NULL; - struct iatt *bufs = NULL; - int i = 0; - int child = 0; - - priv = this->private; - success_children = local->cont.lookup.success_children; - bufs = local->cont.lookup.bufs; - memcpy (local->fresh_children, success_children, - sizeof (*success_children) * priv->child_count); - for (i = 0; i < priv->child_count; i++) { - child = local->fresh_children[i]; - if (child == -1) - break; - if (child == subvol) - continue; - if (uuid_is_null (bufs[child].ia_gfid) && - (bufs[child].ia_type == bufs[subvol].ia_type)) - continue; - afr_children_rm_child (success_children, child, - priv->child_count); - local->success_count--; - } - afr_reset_children (local->fresh_children, priv->child_count); +static char *afr_ignore_xattrs[] = {GF_SELINUX_XATTR_KEY, QUOTA_SIZE_KEY, NULL}; + +gf_boolean_t +afr_is_xattr_ignorable(char *key) +{ + int i = 0; + + if (!strncmp(key, AFR_XATTR_PREFIX, SLEN(AFR_XATTR_PREFIX))) + return _gf_true; + for (i = 0; afr_ignore_xattrs[i]; i++) { + if (!strcmp(key, afr_ignore_xattrs[i])) + return _gf_true; + } + return _gf_false; } -void -afr_succeed_lookup_on_latest_iatt (afr_local_t *local, xlator_t *this) +static gf_boolean_t +afr_xattr_match_needed(dict_t *this, char *key1, data_t *value1, void *data) { - int lsubvol = 0; + /* Ignore all non-disk (i.e. virtual) xattrs right away. */ + if (!gf_is_valid_xattr_namespace(key1)) + return _gf_false; - if (!afr_lookup_conflicting_entries (local, this)) - goto out; + /* Ignore on-disk xattrs that AFR doesn't need to heal. */ + if (!afr_is_xattr_ignorable(key1)) + return _gf_true; - lsubvol = afr_lookup_get_latest_subvol (local, this); - if (lsubvol < 0) - goto out; - afr_lookup_mark_other_entries_stale (local, this, lsubvol); -out: - return; + return _gf_false; } gf_boolean_t -afr_is_entry_possibly_under_creation (afr_local_t *local, xlator_t *this) -{ - /* - * We need to perform this test in lookup done and treat on going - * create/DELETE as ENOENT. - * Reason: - Multiple clients A, B and C are attempting 'mkdir -p /mnt/a/b/c' - - 1 Client A is in the middle of mkdir(/a). It has acquired lock. - It has performed mkdir(/a) on one subvol, and second one is still - in progress - 2 Client B performs a lookup, sees directory /a on one, - ENOENT on the other, succeeds lookup. - 3 Client B performs lookup on /a/b on both subvols, both return ENOENT - (one subvol because /a/b does not exist, another because /a - itself does not exist) - 4 Client B proceeds to mkdir /a/b. It obtains entrylk on inode=/a with - basename=b on one subvol, but fails on other subvol as /a is yet to - be created by Client A. - 5 Client A finishes mkdir of /a on other subvol - 6 Client C also attempts to create /a/b, lookup returns ENOENT on - both subvols. - 7 Client C tries to obtain entrylk on on inode=/a with basename=b, - obtains on one subvol (where B had failed), and waits for B to unlock - on other subvol. - 8 Client B finishes mkdir() on one subvol with GFID-1 and completes - transaction and unlocks - 9 Client C gets the lock on the second subvol, At this stage second - subvol already has /a/b created from Client B, but Client C does not - check that in the middle of mkdir transaction - 10 Client C attempts mkdir /a/b on both subvols. It succeeds on - ONLY ONE (where Client B could not get lock because of - missing parent /a dir) with GFID-2, and gets EEXIST from ONE subvol. - This way we have /a/b in GFID mismatch. One subvol got GFID-1 because - Client B performed transaction on only one subvol (because entrylk() - could not be obtained on second subvol because of missing parent dir -- - caused by premature/speculative succeeding of lookup() on /a when locks - are detected). Other subvol gets GFID-2 from Client C because while - it was waiting for entrylk() on both subvols, Client B was in the - middle of creating mkdir() on only one subvol, and Client C does not - "expect" this when it is between lock() and pre-op()/op() phase of the - transaction. - */ - if (local->cont.lookup.parent_entrylk && local->enoent_count) - return _gf_true; - - return _gf_false; +afr_xattrs_are_equal(dict_t *dict1, dict_t *dict2) +{ + return are_dicts_equal(dict1, dict2, afr_xattr_match_needed, NULL); } +static int +afr_get_parent_read_subvol(xlator_t *this, inode_t *parent, + struct afr_reply *replies, unsigned char *readable) +{ + int i = 0; + int par_read_subvol = -1; + int par_read_subvol_iter = -1; + afr_private_t *priv = NULL; -static void -afr_lookup_done (call_frame_t *frame, xlator_t *this) -{ - int unwind = 1; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int ret = -1; - gf_boolean_t sh_launched = _gf_false; - gf_boolean_t fail_conflict = _gf_false; - int gfid_miss_count = 0; - int enotconn_count = 0; - int up_children_count = 0; - - priv = this->private; - local = frame->local; + priv = this->private; - if (afr_is_entry_possibly_under_creation (local, this)) { - local->op_ret = -1; - local->op_errno = ENOENT; - goto unwind; - } + if (parent) + par_read_subvol = afr_data_subvol_get(parent, this, NULL, NULL, NULL, + NULL); - if (local->op_ret < 0) - goto unwind; - - if (local->cont.lookup.parent_entrylk && local->success_count > 1) - afr_succeed_lookup_on_latest_iatt (local, this); - - gfid_miss_count = afr_lookup_gfid_missing_count (local, this); - up_children_count = afr_up_children_count (local->child_up, - priv->child_count); - enotconn_count = priv->child_count - up_children_count; - if ((gfid_miss_count == local->success_count) && - (enotconn_count > 0)) { - local->op_ret = -1; - local->op_errno = EIO; - gf_log (this->name, GF_LOG_ERROR, "Failing lookup for %s, " - "LOOKUP on a file without gfid is not allowed when " - "some of the children are down", local->loc.path); - goto unwind; - } - - if ((gfid_miss_count == local->success_count) && - uuid_is_null (local->cont.lookup.gfid_req)) { - local->op_ret = -1; - local->op_errno = ENODATA; - gf_log (this->name, GF_LOG_ERROR, "%s: No gfid present", - local->loc.path); - goto unwind; - } - - if (gfid_miss_count && uuid_is_null (local->cont.lookup.gfid_req)) - fail_conflict = _gf_true; - ret = afr_lookup_done_success_action (frame, this, fail_conflict); - if (ret) - goto unwind; - uuid_copy (local->self_heal.sh_gfid_req, local->cont.lookup.gfid_req); + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; - afr_lookup_perform_self_heal (frame, this, &sh_launched); - if (sh_launched) { - unwind = 0; - goto unwind; - } + if (replies[i].op_ret < 0) + continue; - unwind: - if (unwind) { - AFR_STACK_UNWIND (lookup, frame, local->op_ret, - local->op_errno, local->cont.lookup.inode, - &local->cont.lookup.buf, - local->cont.lookup.xattr, - &local->cont.lookup.postparent); + if (par_read_subvol_iter == -1) { + par_read_subvol_iter = i; + continue; } + + if ((par_read_subvol_iter != par_read_subvol) && readable[i]) + par_read_subvol_iter = i; + + if (i == par_read_subvol) + par_read_subvol_iter = i; + } + /* At the end of the for-loop, the only reason why @par_read_subvol_iter + * could be -1 is when this LOOKUP has failed on all sub-volumes. + * So it is okay to send an arbitrary subvolume (0 in this case) + * as parent read subvol. + */ + if (par_read_subvol_iter == -1) + par_read_subvol_iter = 0; + + return par_read_subvol_iter; } -/* - * During a lookup, some errors are more "important" than - * others in that they must be given higher priority while - * returning to the user. - * - * The hierarchy is ESTALE > EIO > ENOENT > others - */ -int32_t -afr_most_important_error(int32_t old_errno, int32_t new_errno, - gf_boolean_t eio) +int +afr_read_subvol_decide(inode_t *inode, xlator_t *this, + afr_read_subvol_args_t *args, unsigned char *readable) { - if (old_errno == ESTALE || new_errno == ESTALE) - return ESTALE; - if (eio && (old_errno == EIO || new_errno == EIO)) - return EIO; - if (old_errno == ENOENT || new_errno == ENOENT) - return ENOENT; + int event = 0; + afr_private_t *priv = NULL; + unsigned char *intersection = NULL; - return new_errno; -} + priv = this->private; + intersection = alloca0(priv->child_count); -int32_t -afr_resultant_errno_get (int32_t *children, - int *child_errno, unsigned int child_count) -{ - int i = 0; - int32_t op_errno = 0; - int child = 0; - - for (i = 0; i < child_count; i++) { - if (children) { - child = children[i]; - if (child == -1) - break; - } else { - child = i; - } - op_errno = afr_most_important_error(op_errno, - child_errno[child], - _gf_false); - } - return op_errno; + afr_readables_intersect_get(inode, this, &event, intersection); + + if (AFR_COUNT(intersection, priv->child_count) <= 0) { + /* TODO: If we have one brick with valid data_readable and + * another with metadata_readable, try to send an iatt with + * valid bits from both.*/ + return -1; + } + + memcpy(readable, intersection, sizeof(*readable) * priv->child_count); + + return afr_read_subvol_select_by_policy(inode, this, intersection, args); } -static void -afr_lookup_handle_error (afr_local_t *local, int32_t op_ret, int32_t op_errno) +static inline int +afr_first_up_child(call_frame_t *frame, xlator_t *this) { - GF_ASSERT (local); - if (op_errno == ENOENT) - local->enoent_count++; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; - local->op_errno = afr_most_important_error(local->op_errno, op_errno, - _gf_false); + local = frame->local; + priv = this->private; - if (local->op_errno == ESTALE) { - local->op_ret = -1; - } + for (i = 0; i < priv->child_count; i++) + if (local->replies[i].valid && local->replies[i].op_ret == 0) + return i; + return -1; } static void -afr_set_root_inode_on_first_lookup (afr_local_t *local, xlator_t *this, - inode_t *inode) +afr_attempt_readsubvol_set(call_frame_t *frame, xlator_t *this, + unsigned char *success_replies, + unsigned char *data_readable, int *read_subvol) { - afr_private_t *priv = NULL; - GF_ASSERT (inode); + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int spb_subvol = -1; + int child_count = -1; - if (!__is_root_gfid (inode->gfid)) - goto out; - if (!afr_is_fresh_lookup (&local->loc, this)) - goto out; - priv = this->private; - if ((priv->first_lookup)) { - gf_log (this->name, GF_LOG_INFO, "added root inode"); - priv->root_inode = inode_ref (inode); - priv->first_lookup = 0; - } -out: + if (*read_subvol != -1) return; -} -static void -afr_lookup_cache_args (afr_local_t *local, int child_index, dict_t *xattr, - struct iatt *buf, struct iatt *postparent) -{ - GF_ASSERT (child_index >= 0); - local->cont.lookup.xattrs[child_index] = dict_ref (xattr); - local->cont.lookup.postparents[child_index] = *postparent; - local->cont.lookup.bufs[child_index] = *buf; + priv = this->private; + local = frame->local; + child_count = priv->child_count; + + afr_split_brain_read_subvol_get(local->inode, this, frame, &spb_subvol); + if ((spb_subvol >= 0) && + (AFR_COUNT(success_replies, child_count) == child_count)) { + *read_subvol = spb_subvol; + } else if (!priv->quorum_count || + frame->root->pid == GF_CLIENT_PID_GLFS_HEAL) { + *read_subvol = afr_first_up_child(frame, this); + } else if (priv->quorum_count && + afr_has_quorum(data_readable, this, NULL)) { + /* read_subvol is guaranteed to be valid if we hit this path. */ + *read_subvol = afr_first_up_child(frame, this); + } else { + /* If quorum is enabled and we do not have a + readable yet, it means all good copies are down. + */ + local->op_ret = -1; + local->op_errno = ENOTCONN; + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_READ_SUBVOL_ERROR, + "no read " + "subvols for %s", + local->loc.path); + } + if (*read_subvol >= 0) + dict_del_sizen(local->replies[*read_subvol].xdata, GF_CONTENT_KEY); } static void -afr_lookup_handle_first_success (afr_local_t *local, xlator_t *this, - inode_t *inode, struct iatt *buf) -{ - local->cont.lookup.inode = inode_ref (inode); - local->cont.lookup.buf = *buf; - afr_set_root_inode_on_first_lookup (local, this, inode); -} +afr_lookup_done(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = -1; + int op_errno = 0; + int read_subvol = 0; + int par_read_subvol = 0; + int ret = -1; + unsigned char *readable = NULL; + unsigned char *success_replies = NULL; + int event = 0; + struct afr_reply *replies = NULL; + uuid_t read_gfid = { + 0, + }; + gf_boolean_t locked_entry = _gf_false; + gf_boolean_t in_flight_create = _gf_false; + gf_boolean_t can_interpret = _gf_true; + inode_t *parent = NULL; + ia_type_t ia_type = IA_INVAL; + afr_read_subvol_args_t args = { + 0, + }; + char *gfid_heal_msg = NULL; + + priv = this->private; + local = frame->local; + replies = local->replies; + parent = local->loc.parent; + + locked_entry = afr_is_possibly_under_txn(AFR_ENTRY_TRANSACTION, local, + this); + + readable = alloca0(priv->child_count); + success_replies = alloca0(priv->child_count); + + afr_inode_read_subvol_get(parent, this, readable, NULL, &event); + par_read_subvol = afr_get_parent_read_subvol(this, parent, replies, + readable); + + /* First, check if we have a gfid-change from somewhere, + If so, propagate that so that a fresh lookup can be + issued + */ + if (local->cont.lookup.needs_fresh_lookup) { + local->op_ret = -1; + local->op_errno = ESTALE; + goto error; + } -static int32_t -afr_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict, - dict_t *xdata) -{ - int ret = 0; - char *pathinfo = NULL; - gf_boolean_t is_local = _gf_false; - afr_private_t *priv = NULL; - int32_t child_index = -1; + op_errno = afr_final_errno(frame->local, this->private); + local->op_errno = op_errno; - if (op_ret != 0) { - goto out; + read_subvol = -1; + afr_fill_success_replies(local, priv, success_replies); + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (replies[i].op_ret == -1) { + if (locked_entry && replies[i].op_errno == ENOENT) { + in_flight_create = _gf_true; + } + continue; } - ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo); - if (ret != 0) { - goto out; + if (read_subvol == -1 || !readable[read_subvol]) { + read_subvol = i; + gf_uuid_copy(read_gfid, replies[i].poststat.ia_gfid); + ia_type = replies[i].poststat.ia_type; + local->op_ret = 0; } + } - ret = afr_local_pathinfo (pathinfo, &is_local); + if (in_flight_create && !afr_has_quorum(success_replies, this, NULL)) { + local->op_ret = -1; + local->op_errno = ENOENT; + goto error; + } + + if (read_subvol == -1) + goto error; + /* We now have a read_subvol, which is readable[] (if there + were any). Next we look for GFID mismatches. We don't + consider a GFID mismatch as an error if read_subvol is + readable[] but the mismatching GFID subvol is not. + */ + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) { + continue; + } + + if (!gf_uuid_compare(replies[i].poststat.ia_gfid, read_gfid)) + continue; + + can_interpret = _gf_false; + + if (locked_entry) + continue; + + /* Now GFIDs mismatch. It's OK as long as this subvol + is not readable[] but read_subvol is */ + if (readable[read_subvol] && !readable[i]) + continue; + + /* If we were called from glfsheal and there is still a gfid + * mismatch, succeed the lookup and let glfsheal print the + * response via gfid-heal-msg.*/ + if (!dict_get_str_sizen(local->xattr_req, "gfid-heal-msg", + &gfid_heal_msg)) + goto cant_interpret; + + /* LOG ERROR */ + local->op_ret = -1; + local->op_errno = EIO; + goto error; + } + + /* Forth, for the finalized GFID, pick the best subvolume + to return stats from. + */ + read_subvol = -1; + memset(readable, 0, sizeof(*readable) * priv->child_count); + if (can_interpret) { + if (!afr_has_quorum(success_replies, this, NULL)) + goto cant_interpret; + /* It is safe to call afr_replies_interpret() because we have + a response from all the UP subvolumes and all of them resolved + to the same GFID + */ + gf_uuid_copy(args.gfid, read_gfid); + args.ia_type = ia_type; + ret = afr_replies_interpret(frame, this, local->inode, NULL); + read_subvol = afr_read_subvol_decide(local->inode, this, &args, + readable); + if (read_subvol == -1) + goto cant_interpret; if (ret) { - goto out; + afr_inode_need_refresh_set(local->inode, this); + dict_del_sizen(local->replies[read_subvol].xdata, GF_CONTENT_KEY); + } + } else { + cant_interpret: + afr_attempt_readsubvol_set(frame, this, success_replies, readable, + &read_subvol); + if (read_subvol == -1) { + goto error; } + } - priv = this->private; - /* - * Note that one local subvolume will override another here. The only - * way to avoid that would be to retain extra information about whether - * the previous read_child is local, and it's just not worth it. Even - * the slowest local subvolume is far preferable to a remote one. - */ - if (is_local) { - child_index = (int32_t)(long)cookie; - gf_log (this->name, GF_LOG_INFO, - "selecting local read_child %s", - priv->children[child_index]->name); - priv->read_child = child_index; + afr_handle_quota_size(frame, this); + + afr_set_need_heal(this, local); + if (AFR_IS_ARBITER_BRICK(priv, read_subvol) && local->op_ret == 0) { + local->op_ret = -1; + local->op_errno = ENOTCONN; + gf_msg_debug(this->name, 0, + "Arbiter cannot be a read subvol " + "for %s", + local->loc.path); + goto error; + } + + ret = dict_get_str_sizen(local->xattr_req, "gfid-heal-msg", &gfid_heal_msg); + if (!ret) { + ret = dict_set_str_sizen(local->replies[read_subvol].xdata, + "gfid-heal-msg", gfid_heal_msg); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED, + "Error setting gfid-heal-msg dict"); + local->op_ret = -1; + local->op_errno = ENOMEM; } + } -out: - STACK_DESTROY(frame->root); - return 0; + AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->replies[read_subvol].poststat, + local->replies[read_subvol].xdata, + &local->replies[par_read_subvol].postparent); + return; + +error: + AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, NULL, NULL, + NULL, NULL); } -static void -afr_attempt_local_discovery (xlator_t *this, int32_t child_index) +/* + * During a lookup, some errors are more "important" than + * others in that they must be given higher priority while + * returning to the user. + * + * The hierarchy is ENODATA > ENOENT > ESTALE > ENOSPC others + */ + +int +afr_higher_errno(int32_t old_errno, int32_t new_errno) { - call_frame_t *newframe = NULL; - loc_t tmploc = {0,}; - afr_private_t *priv = this->private; + if (old_errno == ENODATA || new_errno == ENODATA) + return ENODATA; + if (old_errno == ENOENT || new_errno == ENOENT) + return ENOENT; + if (old_errno == ESTALE || new_errno == ESTALE) + return ESTALE; + if (old_errno == ENOSPC || new_errno == ENOSPC) + return ENOSPC; - newframe = create_frame(this,this->ctx->pool); - if (!newframe) { - return; - } + return new_errno; +} - tmploc.gfid[sizeof(tmploc.gfid)-1] = 1; - STACK_WIND_COOKIE (newframe, afr_discovery_cbk, - (void *)(long)child_index, - priv->children[child_index], - priv->children[child_index]->fops->getxattr, - &tmploc, GF_XATTR_PATHINFO_KEY, NULL); +int +afr_final_errno(afr_local_t *local, afr_private_t *priv) +{ + int i = 0; + int op_errno = 0; + int tmp_errno = 0; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret >= 0) + continue; + tmp_errno = local->replies[i].op_errno; + op_errno = afr_higher_errno(op_errno, tmp_errno); + } + + return op_errno; +} + +static int32_t +afr_local_discovery_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + int ret = 0; + char *pathinfo = NULL; + gf_boolean_t is_local = _gf_false; + afr_private_t *priv = NULL; + int32_t child_index = -1; + + if (op_ret != 0) { + goto out; + } + + priv = this->private; + child_index = (int32_t)(long)cookie; + + ret = dict_get_str_sizen(dict, GF_XATTR_PATHINFO_KEY, &pathinfo); + if (ret != 0) { + goto out; + } + + ret = glusterfs_is_local_pathinfo(pathinfo, &is_local); + if (ret) { + goto out; + } + + /* + * Note that one local subvolume will override another here. The only + * way to avoid that would be to retain extra information about whether + * the previous read_child is local, and it's just not worth it. Even + * the slowest local subvolume is far preferable to a remote one. + */ + if (is_local) { + priv->local[child_index] = 1; + /* Don't set arbiter as read child. */ + if (AFR_IS_ARBITER_BRICK(priv, child_index)) + goto out; + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_LOCAL_CHILD, + "selecting local read_child %s", + priv->children[child_index]->name); + + priv->read_child = child_index; + } +out: + STACK_DESTROY(frame->root); + return 0; } static void -afr_lookup_handle_success (afr_local_t *local, xlator_t *this, int32_t child_index, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xattr, - struct iatt *postparent) +afr_attempt_local_discovery(xlator_t *this, int32_t child_index) { - afr_private_t *priv = this->private; + call_frame_t *newframe = NULL; + loc_t tmploc = { + 0, + }; + afr_private_t *priv = this->private; - if (local->success_count == 0) { - if (local->op_errno != ESTALE) { - local->op_ret = op_ret; - local->op_errno = 0; - } - afr_lookup_handle_first_success (local, this, inode, buf); - } - afr_lookup_update_lk_counts (local, this, - child_index, xattr); + newframe = create_frame(this, this->ctx->pool); + if (!newframe) { + return; + } - afr_lookup_cache_args (local, child_index, xattr, - buf, postparent); + tmploc.gfid[sizeof(tmploc.gfid) - 1] = 1; + STACK_WIND_COOKIE(newframe, afr_local_discovery_cbk, + (void *)(long)child_index, priv->children[child_index], + priv->children[child_index]->fops->getxattr, &tmploc, + GF_XATTR_PATHINFO_KEY, NULL); +} - if (local->do_discovery && (priv->read_child == (-1))) { - afr_attempt_local_discovery(this,child_index); - } +int +afr_lookup_sh_metadata_wrap(void *opaque) +{ + call_frame_t *frame = opaque; + afr_local_t *local = NULL; + xlator_t *this = NULL; + inode_t *inode = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int i = 0, first = -1; + int ret = -1; + dict_t *dict = NULL; + + local = frame->local; + this = frame->this; + priv = this->private; + replies = local->replies; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + first = i; + break; + } + if (first == -1) + goto out; + + if (afr_selfheal_metadata_by_stbuf(this, &replies[first].poststat)) + goto out; + + afr_local_replies_wipe(local, this->private); + + dict = dict_new(); + if (!dict) + goto out; + if (local->xattr_req) { + dict_copy(local->xattr_req, dict); + } + + ret = dict_set_sizen_str_sizen(dict, "link-count", GF_XATTROP_INDEX_COUNT); + if (ret) { + gf_msg_debug(this->name, -ret, "Unable to set link-count in dict "); + } + + if (loc_is_nameless(&local->loc)) { + ret = afr_selfheal_unlocked_discover_on(frame, local->inode, + local->loc.gfid, local->replies, + local->child_up, dict); + } else { + inode = afr_selfheal_unlocked_lookup_on(frame, local->loc.parent, + local->loc.name, local->replies, + local->child_up, dict); + } + if (inode) + inode_unref(inode); +out: + if (loc_is_nameless(&local->loc)) + afr_discover_done(frame, this); + else + afr_lookup_done(frame, this); - local->cont.lookup.success_children[local->success_count] = child_index; - local->success_count++; + if (dict) + dict_unref(dict); + + return 0; } -int -afr_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) +gf_boolean_t +afr_is_pending_set(xlator_t *this, dict_t *xdata, int type) { - afr_local_t * local = NULL; - int call_count = -1; - int child_index = -1; + int idx = -1; + afr_private_t *priv = NULL; + void *pending_raw = NULL; + int *pending_int = NULL; + int i = 0; - child_index = (long) cookie; + priv = this->private; + idx = afr_index_for_transaction_type(type); - LOCK (&frame->lock); - { - local = frame->local; + if (dict_get_ptr(xdata, AFR_DIRTY, &pending_raw) == 0) { + if (pending_raw) { + pending_int = pending_raw; - if (op_ret == -1) { - afr_lookup_handle_error (local, op_ret, op_errno); - goto unlock; - } - afr_lookup_handle_success (local, this, child_index, op_ret, - op_errno, inode, buf, xattr, - postparent); + if (ntoh32(pending_int[idx])) + return _gf_true; + } + } - } -unlock: - UNLOCK (&frame->lock); + for (i = 0; i < priv->child_count; i++) { + if (dict_get_ptr(xdata, priv->pending_key[i], &pending_raw)) + continue; + if (!pending_raw) + continue; + pending_int = pending_raw; - call_count = afr_frame_return (frame); - if (call_count == 0) { - afr_lookup_done (frame, this); - } + if (ntoh32(pending_int[idx])) + return _gf_true; + } - return 0; + return _gf_false; } -int -afr_lookup_cont_init (afr_local_t *local, unsigned int child_count) -{ - int ret = -ENOMEM; - struct iatt *iatts = NULL; - int32_t *success_children = NULL; - int32_t *sources = NULL; - int32_t **pending_matrix = NULL; - - GF_ASSERT (local); - local->cont.lookup.xattrs = GF_CALLOC (child_count, - sizeof (*local->cont.lookup.xattr), - gf_afr_mt_dict_t); - if (NULL == local->cont.lookup.xattrs) - goto out; +static gf_boolean_t +afr_can_start_metadata_self_heal(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int i = 0, first = -1; + gf_boolean_t start = _gf_false; + struct iatt stbuf = { + 0, + }; + + local = frame->local; + replies = local->replies; + priv = this->private; + + if (!priv->metadata_self_heal) + return _gf_false; - iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt); - if (NULL == iatts) - goto out; - local->cont.lookup.postparents = iatts; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (first == -1) { + first = i; + stbuf = replies[i].poststat; + continue; + } - iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt); - if (NULL == iatts) - goto out; - local->cont.lookup.bufs = iatts; + if (afr_is_pending_set(this, replies[i].xdata, + AFR_METADATA_TRANSACTION)) { + /* Let shd do the heal so that lookup is not blocked + * on getting metadata lock/doing the heal */ + start = _gf_false; + break; + } - success_children = afr_children_create (child_count); - if (NULL == success_children) - goto out; - local->cont.lookup.success_children = success_children; + if (gf_uuid_compare(stbuf.ia_gfid, replies[i].poststat.ia_gfid)) { + start = _gf_false; + break; + } + if (!IA_EQUAL(stbuf, replies[i].poststat, type)) { + start = _gf_false; + break; + } - local->fresh_children = afr_children_create (child_count); - if (NULL == local->fresh_children) - goto out; + /*Check if iattrs need heal*/ + if ((!IA_EQUAL(stbuf, replies[i].poststat, uid)) || + (!IA_EQUAL(stbuf, replies[i].poststat, gid)) || + (!IA_EQUAL(stbuf, replies[i].poststat, prot))) { + start = _gf_true; + continue; + } - sources = GF_CALLOC (sizeof (*sources), child_count, gf_afr_mt_int32_t); - if (NULL == sources) - goto out; - local->cont.lookup.sources = sources; + /*Check if xattrs need heal*/ + if (!afr_xattrs_are_equal(replies[first].xdata, replies[i].xdata)) + start = _gf_true; + } - pending_matrix = afr_matrix_create (child_count, child_count); - if (NULL == pending_matrix) - goto out; - local->cont.lookup.pending_matrix = pending_matrix; + return start; +} - ret = 0; +int +afr_lookup_metadata_heal_check(call_frame_t *frame, xlator_t *this) + +{ + call_frame_t *heal = NULL; + afr_local_t *local = NULL; + int ret = 0; + + local = frame->local; + if (!afr_can_start_metadata_self_heal(frame, this)) + goto out; + + heal = afr_frame_create(this, &ret); + if (!heal) { + ret = -ret; + goto out; + } + + ret = synctask_new(this->ctx->env, afr_lookup_sh_metadata_wrap, + afr_refresh_selfheal_done, heal, frame); + if (ret) + goto out; + return ret; out: - return ret; + if (loc_is_nameless(&local->loc)) + afr_discover_done(frame, this); + else + afr_lookup_done(frame, this); + if (heal) + AFR_STACK_DESTROY(heal); + return ret; } int -afr_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) +afr_lookup_selfheal_wrap(void *opaque) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - void *gfid_req = NULL; - int ret = -1; - int i = 0; - int call_count = 0; - uint64_t ctx = 0; - int32_t op_errno = 0; + int ret = 0; + call_frame_t *frame = opaque; + afr_local_t *local = NULL; + xlator_t *this = NULL; + inode_t *inode = NULL; + uuid_t pargfid = { + 0, + }; - priv = this->private; + local = frame->local; + this = frame->this; + loc_pargfid(&local->loc, pargfid); - AFR_LOCAL_ALLOC_OR_GOTO (local, out); + ret = afr_selfheal_name(frame->this, pargfid, local->loc.name, + &local->cont.lookup.gfid_req, local->xattr_req); + if (ret == -EIO) + goto unwind; - local->op_ret = -1; + afr_local_replies_wipe(local, this->private); - frame->local = local; - local->fop = GF_FOP_LOOKUP; + inode = afr_selfheal_unlocked_lookup_on(frame, local->loc.parent, + local->loc.name, local->replies, + local->child_up, local->xattr_req); + if (inode) + inode_unref(inode); - loc_copy (&local->loc, loc); - ret = loc_path (&local->loc, NULL); - if (ret < 0) { - op_errno = EINVAL; - goto out; - } - - if (local->loc.path && - (strcmp (local->loc.path, "/" GF_REPLICATE_TRASH_DIR) == 0)) { - op_errno = EPERM; - ret = -1; - goto out; - } + afr_lookup_metadata_heal_check(frame, this); + return 0; - ret = inode_ctx_get (local->loc.inode, this, &ctx); - if (ret == 0) { - /* lookup is a revalidate */ +unwind: + AFR_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, NULL, NULL); + return 0; +} - local->read_child_index = afr_inode_get_read_ctx (this, - local->loc.inode, - NULL); +int +afr_lookup_entry_heal(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + call_frame_t *heal = NULL; + int i = 0, first = -1; + gf_boolean_t name_state_mismatch = _gf_false; + struct afr_reply *replies = NULL; + int ret = 0; + unsigned char *par_readables = NULL; + unsigned char *success = NULL; + int32_t op_errno = 0; + uuid_t gfid = {0}; + + local = frame->local; + replies = local->replies; + priv = this->private; + par_readables = alloca0(priv->child_count); + success = alloca0(priv->child_count); + + ret = afr_inode_read_subvol_get(local->loc.parent, this, par_readables, + NULL, NULL); + if (ret < 0 || AFR_COUNT(par_readables, priv->child_count) == 0) { + /* In this case set par_readables to all 1 so that name_heal + * need checks at the end of this function will flag missing + * entry when name state mismatches*/ + memset(par_readables, 1, priv->child_count); + } + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (replies[i].op_ret == 0) { + if (gf_uuid_is_null(gfid)) { + gf_uuid_copy(gfid, replies[i].poststat.ia_gfid); + } + success[i] = 1; } else { - LOCK (&priv->read_child_lock); - { - if (priv->hash_mode) { - local->read_child_index = -1; - } - else { - local->read_child_index = - (++priv->read_child_rr) % - (priv->child_count); - } - } - UNLOCK (&priv->read_child_lock); - local->cont.lookup.fresh_lookup = _gf_true; + if ((replies[i].op_errno != ENOTCONN) && + (replies[i].op_errno != ENOENT) && + (replies[i].op_errno != ESTALE)) { + op_errno = replies[i].op_errno; + } } - local->child_up = memdup (priv->child_up, - sizeof (*local->child_up) * priv->child_count); - if (NULL == local->child_up) { - op_errno = ENOMEM; - goto out; + /*gfid is missing, needs heal*/ + if ((replies[i].op_ret == -1) && (replies[i].op_errno == ENODATA)) { + goto name_heal; } - ret = afr_lookup_cont_init (local, priv->child_count); - if (ret < 0) { - op_errno = -ret; - goto out; + if (first == -1) { + first = i; + continue; } - local->call_count = afr_up_children_count (local->child_up, - priv->child_count); - call_count = local->call_count; - if (local->call_count == 0) { - ret = -1; - op_errno = ENOTCONN; - goto out; + if (replies[i].op_ret != replies[first].op_ret) { + name_state_mismatch = _gf_true; } - /* By default assume ENOTCONN. On success it will be set to 0. */ - local->op_errno = ENOTCONN; - - ret = afr_lookup_xattr_req_prepare (local, this, xattr_req, &local->loc, - &gfid_req); - if (ret) { - local->op_errno = -ret; - goto out; - } - afr_lookup_save_gfid (local->cont.lookup.gfid_req, gfid_req, - &local->loc); - local->fop = GF_FOP_LOOKUP; - if (priv->choose_local && !priv->did_discovery) { - if (gfid_req && __is_root_gfid(gfid_req)) { - local->do_discovery = _gf_true; - priv->did_discovery = _gf_true; - } + if (replies[i].op_ret == 0) { + /* Rename after this lookup may succeed if we don't do + * a name-heal and the destination may not have pending xattrs + * to indicate which name is good and which is bad so always do + * this heal*/ + if (gf_uuid_compare(replies[i].poststat.ia_gfid, gfid)) { + goto name_heal; + } } + } + + if (name_state_mismatch) { + if (!priv->quorum_count) + goto name_heal; + if (!afr_has_quorum(success, this, NULL)) + goto name_heal; + if (op_errno) + goto name_heal; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_lookup_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - &local->loc, local->xattr_req); - if (!--call_count) - break; - } + if (!replies[i].valid) + continue; + if (par_readables[i] && replies[i].op_ret < 0 && + replies[i].op_errno != ENOTCONN) { + goto name_heal; + } } + } - ret = 0; -out: - if (ret) - AFR_STACK_UNWIND (lookup, frame, -1, op_errno, - NULL, NULL, NULL, NULL); - - return 0; -} + goto metadata_heal; +name_heal: + heal = afr_frame_create(this, NULL); + if (!heal) + goto metadata_heal; -/* {{{ open */ + ret = synctask_new(this->ctx->env, afr_lookup_selfheal_wrap, + afr_refresh_selfheal_done, heal, frame); + if (ret) { + AFR_STACK_DESTROY(heal); + goto metadata_heal; + } + return ret; -int -__afr_fd_ctx_set (xlator_t *this, fd_t *fd) -{ - afr_private_t * priv = NULL; - int ret = -1; - uint64_t ctx = 0; - afr_fd_ctx_t * fd_ctx = NULL; +metadata_heal: + ret = afr_lookup_metadata_heal_check(frame, this); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (fd, out); + return ret; +} - priv = this->private; +int +afr_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + afr_local_t *local = NULL; + int call_count = -1; + int child_index = -1; + GF_UNUSED int ret = 0; + int8_t need_heal = 1; + + child_index = (long)cookie; + + local = frame->local; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + /* + * On revalidate lookup if the gfid-changed, afr should unwind the fop + * with ESTALE so that a fresh lookup will be sent by the top xlator. + * So remember it. + */ + if (xdata && dict_get_sizen(xdata, "gfid-changed")) + local->cont.lookup.needs_fresh_lookup = _gf_true; + + if (xdata) { + ret = dict_get_int8(xdata, "link-count", &need_heal); + local->replies[child_index].need_heal = need_heal; + } else { + local->replies[child_index].need_heal = need_heal; + } + if (op_ret != -1) { + local->replies[child_index].poststat = *buf; + local->replies[child_index].postparent = *postparent; + if (xdata) + local->replies[child_index].xdata = dict_ref(xdata); + } + + call_count = afr_frame_return(frame); + if (call_count == 0) { + afr_set_need_heal(this, local); + afr_lookup_entry_heal(frame, this); + } + + return 0; +} - ret = __fd_ctx_get (fd, this, &ctx); +static void +afr_discover_unwind(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int read_subvol = -1; + int ret = 0; + unsigned char *data_readable = NULL; + unsigned char *success_replies = NULL; - if (ret == 0) - goto out; + priv = this->private; + local = frame->local; + data_readable = alloca0(priv->child_count); + success_replies = alloca0(priv->child_count); - fd_ctx = GF_CALLOC (1, sizeof (afr_fd_ctx_t), - gf_afr_mt_afr_fd_ctx_t); - if (!fd_ctx) { - ret = -ENOMEM; - goto out; - } + afr_fill_success_replies(local, priv, success_replies); + if (AFR_COUNT(success_replies, priv->child_count) > 0) + local->op_ret = 0; - fd_ctx->pre_op_done = GF_CALLOC (sizeof (*fd_ctx->pre_op_done), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->pre_op_done) { - ret = -ENOMEM; - goto out; - } + if (local->op_ret < 0) { + local->op_ret = -1; + local->op_errno = afr_final_errno(frame->local, this->private); + goto error; + } - fd_ctx->pre_op_piggyback = GF_CALLOC (sizeof (*fd_ctx->pre_op_piggyback), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->pre_op_piggyback) { - ret = -ENOMEM; - goto out; - } + if (!afr_has_quorum(success_replies, this, frame)) + goto unwind; - fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), - priv->child_count, - gf_afr_mt_int32_t); - if (!fd_ctx->opened_on) { - ret = -ENOMEM; - goto out; - } + ret = afr_replies_interpret(frame, this, local->inode, NULL); + if (ret) { + afr_inode_need_refresh_set(local->inode, this); + } - fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->lock_piggyback) { - ret = -ENOMEM; - goto out; - } + read_subvol = afr_read_subvol_decide(local->inode, this, NULL, + data_readable); - fd_ctx->lock_acquired = GF_CALLOC (sizeof (*fd_ctx->lock_acquired), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->lock_acquired) { - ret = -ENOMEM; - goto out; - } +unwind: + afr_attempt_readsubvol_set(frame, this, success_replies, data_readable, + &read_subvol); + if (read_subvol == -1) + goto error; - fd_ctx->up_count = priv->up_count; - fd_ctx->down_count = priv->down_count; + if (AFR_IS_ARBITER_BRICK(priv, read_subvol) && local->op_ret == 0) { + local->op_ret = -1; + local->op_errno = ENOTCONN; + gf_msg_debug(this->name, 0, + "Arbiter cannot be a read subvol " + "for %s", + local->loc.path); + } - fd_ctx->locked_on = GF_CALLOC (sizeof (*fd_ctx->locked_on), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->locked_on) { - ret = -ENOMEM; - goto out; - } + AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->replies[read_subvol].poststat, + local->replies[read_subvol].xdata, + &local->replies[read_subvol].postparent); + return; - pthread_mutex_init (&fd_ctx->delay_lock, NULL); - INIT_LIST_HEAD (&fd_ctx->entries); - fd_ctx->call_child = -1; +error: + AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, NULL, NULL, + NULL, NULL); +} - INIT_LIST_HEAD (&fd_ctx->eager_locked); +static int +afr_ta_id_file_check(void *opaque) +{ + afr_private_t *priv = NULL; + xlator_t *this = NULL; + loc_t loc = { + 0, + }; + struct iatt stbuf = { + 0, + }; + dict_t *dict = NULL; + uuid_t gfid = { + 0, + }; + fd_t *fd = NULL; + int ret = 0; + + this = opaque; + priv = this->private; + + ret = afr_fill_ta_loc(this, &loc, _gf_false); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to populate thin-arbiter loc for: %s.", loc.name); + goto out; + } + + ret = syncop_lookup(priv->children[THIN_ARBITER_BRICK_INDEX], &loc, &stbuf, + 0, 0, 0); + if (ret == 0) { + goto out; + } else if (ret == -ENOENT) { + fd = fd_create(loc.inode, getpid()); + if (!fd) + goto out; + dict = dict_new(); + if (!dict) + goto out; + gf_uuid_generate(gfid); + ret = dict_set_gfuuid(dict, "gfid-req", gfid, true); + ret = syncop_create(priv->children[THIN_ARBITER_BRICK_INDEX], &loc, + O_RDWR, 0664, fd, &stbuf, dict, NULL); + } - ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "failed to set fd ctx (%p)", fd); out: - return ret; + if (ret == 0) { + gf_uuid_copy(priv->ta_gfid, stbuf.ia_gfid); + } else { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to lookup/create thin-arbiter id file."); + } + if (dict) + dict_unref(dict); + if (fd) + fd_unref(fd); + loc_wipe(&loc); + + return 0; } +static int +afr_ta_id_file_check_cbk(int ret, call_frame_t *ta_frame, void *opaque) +{ + return 0; +} -int -afr_fd_ctx_set (xlator_t *this, fd_t *fd) +static void +afr_discover_done(call_frame_t *frame, xlator_t *this) { - int ret = -1; + int ret = 0; + afr_private_t *priv = NULL; - LOCK (&fd->lock); - { - ret = __afr_fd_ctx_set (this, fd); - } - UNLOCK (&fd->lock); + priv = this->private; + if (!priv->thin_arbiter_count) + goto unwind; + if (!gf_uuid_is_null(priv->ta_gfid)) + goto unwind; - return ret; + ret = synctask_new(this->ctx->env, afr_ta_id_file_check, + afr_ta_id_file_check_cbk, NULL, this); + if (ret) + goto unwind; +unwind: + afr_discover_unwind(frame, this); } -/* {{{ flush */ - int -afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +afr_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata, + struct iatt *postparent) { - afr_local_t * local = NULL; - int call_count = -1; - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - } + afr_local_t *local = NULL; + int call_count = -1; + int child_index = -1; + GF_UNUSED int ret = 0; + int8_t need_heal = 1; - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + child_index = (long)cookie; - call_count = afr_frame_return (frame); + local = frame->local; - if (call_count == 0) - AFR_STACK_UNWIND(flush, frame, local->op_ret, - local->op_errno, NULL); + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (op_ret != -1) { + local->replies[child_index].poststat = *buf; + local->replies[child_index].postparent = *postparent; + if (xdata) + local->replies[child_index].xdata = dict_ref(xdata); + } - return 0; -} + if (local->do_discovery && (op_ret == 0)) + afr_attempt_local_discovery(this, child_index); -static int -afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) -{ - int i = 0; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; + if (xdata) { + ret = dict_get_int8(xdata, "link-count", &need_heal); + local->replies[child_index].need_heal = need_heal; + } else { + local->replies[child_index].need_heal = need_heal; + } - priv = this->private; - local = frame->local; - call_count = local->call_count; + call_count = afr_frame_return(frame); + if (call_count == 0) { + afr_set_need_heal(this, local); + afr_lookup_metadata_heal_check(frame, this); + } - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_flush_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->flush, - local->fd, NULL); - if (!--call_count) - break; + return 0; +} - } +int +afr_discover_do(call_frame_t *frame, xlator_t *this, int err) +{ + int ret = 0; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + local = frame->local; + priv = this->private; + + if (err) { + local->op_errno = err; + goto out; + } + + call_count = local->call_count = AFR_COUNT(local->child_up, + priv->child_count); + + ret = afr_lookup_xattr_req_prepare(local, this, local->xattr_req, + &local->loc); + if (ret) { + local->op_errno = -ret; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE( + frame, afr_discover_cbk, (void *)(long)i, priv->children[i], + priv->children[i]->fops->lookup, &local->loc, local->xattr_req); + if (!--call_count) + break; } + } - return 0; + return 0; +out: + AFR_STACK_UNWIND(lookup, frame, -1, local->op_errno, 0, 0, 0, 0); + return 0; } int -afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +afr_discover(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - call_stub_t *stub = NULL; - int ret = -1; - int op_errno = 0; + int op_errno = ENOMEM; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int event = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + priv = this->private; - priv = this->private; + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + if (!local->call_count) { + op_errno = ENOTCONN; + goto out; + } - ret = afr_local_init(local, priv, &op_errno); - if (ret < 0) - goto out; + if (__is_root_gfid(loc->inode->gfid)) { + if (!priv->root_inode) + priv->root_inode = inode_ref(loc->inode); - local->fd = fd_ref(fd); - stub = fop_flush_stub (frame, afr_flush_wrapper, fd, xdata); - if (!stub) { - ret = -1; - op_errno = ENOMEM; - goto out; + if (priv->choose_local && !priv->did_discovery) { + /* Logic to detect which subvolumes of AFR are + local, in order to prefer them for reads + */ + local->do_discovery = _gf_true; + priv->did_discovery = _gf_true; } + } - afr_delayed_changelog_wake_resume (this, fd, stub); - ret = 0; + local->op = GF_FOP_LOOKUP; -out: - if (ret < 0) - AFR_STACK_UNWIND(flush, frame, -1, op_errno, NULL); + loc_copy(&local->loc, loc); + + local->inode = inode_ref(loc->inode); + if (xattr_req) { + /* If xattr_req was null, afr_lookup_xattr_req_prepare() will + allocate one for us */ + local->xattr_req = dict_copy_with_ref(xattr_req, NULL); + if (!local->xattr_req) { + op_errno = ENOMEM; + goto out; + } + } + + if (gf_uuid_is_null(loc->inode->gfid)) { + afr_discover_do(frame, this, 0); return 0; + } + + afr_read_subvol_get(loc->inode, this, NULL, NULL, &event, + AFR_DATA_TRANSACTION, NULL); + + afr_discover_do(frame, this, 0); + + return 0; +out: + AFR_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + return 0; } -/* }}} */ +int +afr_lookup_do(call_frame_t *frame, xlator_t *this, int err) +{ + int ret = 0; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + local = frame->local; + priv = this->private; + + if (err < 0) { + local->op_errno = err; + goto out; + } + + call_count = local->call_count = AFR_COUNT(local->child_up, + priv->child_count); + + ret = afr_lookup_xattr_req_prepare(local, this, local->xattr_req, + &local->loc); + if (ret) { + local->op_errno = -ret; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE( + frame, afr_lookup_cbk, (void *)(long)i, priv->children[i], + priv->children[i]->fops->lookup, &local->loc, local->xattr_req); + if (!--call_count) + break; + } + } + return 0; +out: + AFR_STACK_UNWIND(lookup, frame, -1, local->op_errno, 0, 0, 0, 0); + return 0; +} +/* + * afr_lookup() + * + * The goal here is to figure out what the element getting looked up is. + * i.e what is the GFID, inode type and a conservative estimate of the + * inode attributes are. + * + * As we lookup, operations may be underway on the entry name and the + * inode. In lookup() we are primarily concerned only with the entry + * operations. If the entry is getting unlinked or renamed, we detect + * what operation is underway by querying for on-going transactions and + * pending self-healing on the entry through xdata. + * + * If the entry is a file/dir, it may need self-heal and/or in a + * split-brain condition. Lookup is not the place to worry about these + * conditions. Outcast marking will naturally handle them in the read + * paths. + * + * Here is a brief goal of what we are trying to achieve: + * + * - LOOKUP on all subvolumes concurrently, querying on-going transaction + * and pending self-heal info from the servers. + * + * - If all servers reply the same inode type and GFID, the overall call + * MUST be a success. + * + * - If inode types or GFIDs mismatch, and there IS either an on-going + * transaction or pending self-heal, inspect what the nature of the + * transaction or pending heal is, and select the appropriate subvolume's + * reply as the winner. + * + * - If inode types or GFIDs mismatch, and there are no on-going transactions + * or pending self-heal on the entry name on any of the servers, fail the + * lookup with EIO. Something has gone wrong beyond reasonable action. + */ int -afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) +afr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) { - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - int ret = 0; - - ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) - goto out; + afr_local_t *local = NULL; + int32_t op_errno = 0; + int event = 0; + int ret = 0; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + if (loc_is_nameless(loc)) { + if (xattr_req) + dict_del_sizen(xattr_req, "gfid-req"); + afr_discover(frame, this, loc, xattr_req); + return 0; + } - if (fd_ctx) { - GF_FREE (fd_ctx->pre_op_done); + if (afr_is_private_directory(this->private, loc->parent->gfid, loc->name, + frame->root->pid)) { + op_errno = EPERM; + goto out; + } - GF_FREE (fd_ctx->opened_on); + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - GF_FREE (fd_ctx->locked_on); + if (!local->call_count) { + op_errno = ENOTCONN; + goto out; + } - GF_FREE (fd_ctx->pre_op_piggyback); - GF_FREE (fd_ctx->lock_piggyback); + local->op = GF_FOP_LOOKUP; - GF_FREE (fd_ctx->lock_acquired); + loc_copy(&local->loc, loc); - pthread_mutex_destroy (&fd_ctx->delay_lock); + local->inode = inode_ref(loc->inode); - GF_FREE (fd_ctx); + if (xattr_req) { + /* If xattr_req was null, afr_lookup_xattr_req_prepare() will + allocate one for us */ + local->xattr_req = dict_copy_with_ref(xattr_req, NULL); + if (!local->xattr_req) { + op_errno = ENOMEM; + goto out; + } + ret = dict_get_gfuuid(local->xattr_req, "gfid-req", + &local->cont.lookup.gfid_req); + if (ret == 0) { + dict_del_sizen(local->xattr_req, "gfid-req"); } + } + afr_read_subvol_get(loc->parent, this, NULL, NULL, &event, + AFR_DATA_TRANSACTION, NULL); + + afr_lookup_do(frame, this, 0); + + return 0; out: - return 0; + AFR_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + + return 0; } +void +_afr_cleanup_fd_ctx(xlator_t *this, afr_fd_ctx_t *fd_ctx) +{ + afr_private_t *priv = this->private; + + if (fd_ctx->lk_heal_info) { + LOCK(&priv->lock); + { + list_del(&fd_ctx->lk_heal_info->pos); + } + afr_lk_heal_info_cleanup(fd_ctx->lk_heal_info); + fd_ctx->lk_heal_info = NULL; + } + GF_FREE(fd_ctx->opened_on); + GF_FREE(fd_ctx); + return; +} int -afr_release (xlator_t *this, fd_t *fd) +afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd) { - afr_locked_fd_t *locked_fd = NULL; - afr_locked_fd_t *tmp = NULL; - afr_private_t *priv = NULL; + uint64_t ctx = 0; + afr_fd_ctx_t *fd_ctx = NULL; + int ret = 0; - priv = this->private; + ret = fd_ctx_get(fd, this, &ctx); + if (ret < 0) + goto out; - afr_cleanup_fd_ctx (this, fd); + fd_ctx = (afr_fd_ctx_t *)(long)ctx; - list_for_each_entry_safe (locked_fd, tmp, &priv->saved_fds, - list) { + if (fd_ctx) { + _afr_cleanup_fd_ctx(this, fd_ctx); + } - if (locked_fd->fd == fd) { - list_del_init (&locked_fd->list); - GF_FREE (locked_fd); - } +out: + return 0; +} - } +int +afr_release(xlator_t *this, fd_t *fd) +{ + afr_cleanup_fd_ctx(this, fd); - return 0; + return 0; } +afr_fd_ctx_t * +__afr_fd_ctx_get(fd_t *fd, xlator_t *this) +{ + uint64_t ctx = 0; + int ret = 0; + afr_fd_ctx_t *fd_ctx = NULL; -/* {{{ fsync */ + ret = __fd_ctx_get(fd, this, &ctx); -int -afr_fsync_unwind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) + if (ret < 0) { + ret = __afr_fd_ctx_set(this, fd); + if (ret < 0) + goto out; + + ret = __fd_ctx_get(fd, this, &ctx); + if (ret < 0) + goto out; + } + + fd_ctx = (afr_fd_ctx_t *)(long)ctx; +out: + return fd_ctx; +} + +afr_fd_ctx_t * +afr_fd_ctx_get(fd_t *fd, xlator_t *this) { - AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf, - xdata); - return 0; + afr_fd_ctx_t *fd_ctx = NULL; + + LOCK(&fd->lock); + { + fd_ctx = __afr_fd_ctx_get(fd, this); + } + UNLOCK(&fd->lock); + + return fd_ctx; } int -afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +__afr_fd_ctx_set(xlator_t *this, fd_t *fd) { - afr_local_t *local = NULL; - int call_count = -1; - int child_index = (long) cookie; - int read_child = 0; - call_stub_t *stub = NULL; + afr_private_t *priv = NULL; + int ret = -1; + uint64_t ctx = 0; + afr_fd_ctx_t *fd_ctx = NULL; + int i = 0; - local = frame->local; + VALIDATE_OR_GOTO(this->private, out); + VALIDATE_OR_GOTO(fd, out); - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + priv = this->private; - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } + ret = __fd_ctx_get(fd, this, &ctx); - if (op_ret == 0) { - local->op_ret = 0; + if (ret == 0) + goto out; - if (local->success_count == 0) { - local->cont.inode_wfop.prebuf = *prebuf; - local->cont.inode_wfop.postbuf = *postbuf; - } + fd_ctx = GF_CALLOC(1, sizeof(afr_fd_ctx_t), gf_afr_mt_afr_fd_ctx_t); + if (!fd_ctx) { + ret = -ENOMEM; + goto out; + } - if (child_index == read_child) { - local->cont.inode_wfop.prebuf = *prebuf; - local->cont.inode_wfop.postbuf = *postbuf; - } + fd_ctx->opened_on = GF_CALLOC(sizeof(*fd_ctx->opened_on), priv->child_count, + gf_afr_mt_int32_t); + if (!fd_ctx->opened_on) { + ret = -ENOMEM; + goto out; + } - local->success_count++; - } + for (i = 0; i < priv->child_count; i++) { + if (fd_is_anonymous(fd)) + fd_ctx->opened_on[i] = AFR_FD_OPENED; + else + fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED; + } - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - /* Make a stub out of the frame, and register it - with the waking up post-op. When the call-stub resumes, - we are guaranteed that there was no post-op pending - (i.e changelogs were unset in the server). This is an - essential "guarantee", that fsync() returns only after - completely finishing EVERYTHING, including the delayed - post-op. This guarantee is expected by FUSE graph switching - for example. - */ - stub = fop_fsync_cbk_stub (frame, afr_fsync_unwind_cbk, - local->op_ret, local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - xdata); - if (!stub) { - AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0); - return 0; - } - - /* If no new unstable writes happened between the - time we cleared the unstable write witness flag in afr_fsync - and now, calling afr_delayed_changelog_wake_up() should - wake up and skip over the fsync phase and go straight to - afr_changelog_post_op_now() - */ - afr_delayed_changelog_wake_resume (this, local->fd, stub); - } + fd_ctx->readdir_subvol = -1; + fd_ctx->lk_heal_info = NULL; - return 0; + ret = __fd_ctx_set(fd, this, (uint64_t)(long)fd_ctx); + if (ret) + gf_msg_debug(this->name, 0, "failed to set fd ctx (%p)", fd); +out: + if (ret && fd_ctx) + _afr_cleanup_fd_ctx(this, fd_ctx); + return ret; } +/* {{{ flush */ int -afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync, dict_t *xdata) +afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int ret = -1; - int i = 0; - int32_t call_count = 0; - int32_t op_errno = 0; + afr_local_t *local = NULL; + int call_count = -1; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + local = frame->local; - priv = this->private; + LOCK(&frame->lock); + { + if (op_ret != -1) { + local->op_ret = op_ret; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref(xdata); + } else { + local->op_errno = op_errno; + } + call_count = --local->call_count; + } + UNLOCK(&frame->lock); - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + if (call_count == 0) + AFR_STACK_UNWIND(flush, frame, local->op_ret, local->op_errno, + local->xdata_rsp); - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + return 0; +} - call_count = local->call_count; +static int +afr_flush_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + + priv = this->private; + local = frame->local; + call_count = local->call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE(frame, afr_flush_cbk, (void *)(long)i, + priv->children[i], priv->children[i]->fops->flush, + local->fd, xdata); + if (!--call_count) + break; + } + } - local->fd = fd_ref (fd); + return 0; +} - if (afr_fd_has_witnessed_unstable_write (this, fd)) { - /* don't care. we only wanted to CLEAR the bit */ - } +afr_local_t * +afr_wakeup_same_fd_delayed_op(xlator_t *this, afr_lock_t *lock, fd_t *fd) +{ + afr_local_t *local = NULL; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_fsync_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fsync, - fd, datasync, xdata); - if (!--call_count) - break; - } + if (lock->delay_timer) { + local = list_entry(lock->post_op.next, afr_local_t, + transaction.owner_list); + if (fd == local->fd) { + if (gf_timer_call_cancel(this->ctx, lock->delay_timer)) { + local = NULL; + } else { + lock->delay_timer = NULL; + } + } else { + local = NULL; } + } - ret = 0; -out: - if (ret < 0) - AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + return local; } -/* }}} */ - -/* {{{ fsync */ +void +afr_delayed_changelog_wake_resume(xlator_t *this, inode_t *inode, + call_stub_t *stub) +{ + afr_inode_ctx_t *ctx = NULL; + afr_lock_t *lock = NULL; + afr_local_t *metadata_local = NULL; + afr_local_t *data_local = NULL; + LOCK(&inode->lock); + { + (void)__afr_inode_ctx_get(this, inode, &ctx); + lock = &ctx->lock[AFR_DATA_TRANSACTION]; + data_local = afr_wakeup_same_fd_delayed_op(this, lock, stub->args.fd); + lock = &ctx->lock[AFR_METADATA_TRANSACTION]; + metadata_local = afr_wakeup_same_fd_delayed_op(this, lock, + stub->args.fd); + } + UNLOCK(&inode->lock); + + if (data_local) { + data_local->transaction.resume_stub = stub; + } else if (metadata_local) { + metadata_local->transaction.resume_stub = stub; + } else { + call_resume(stub); + } + if (data_local) { + afr_delayed_changelog_wake_up_cbk(data_local); + } + if (metadata_local) { + afr_delayed_changelog_wake_up_cbk(metadata_local); + } +} -int32_t -afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xdata) +int +afr_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - afr_local_t *local = NULL; - int call_count = -1; + afr_local_t *local = NULL; + call_stub_t *stub = NULL; + int op_errno = ENOMEM; - local = frame->local; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; + local->op = GF_FOP_FLUSH; + if (!afr_is_consistent_io_possible(local, this->private, &op_errno)) + goto out; - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + local->fd = fd_ref(fd); - call_count = afr_frame_return (frame); + stub = fop_flush_stub(frame, afr_flush_wrapper, fd, xdata); + if (!stub) + goto out; - if (call_count == 0) - AFR_STACK_UNWIND (fsyncdir, frame, local->op_ret, - local->op_errno, xdata); + afr_delayed_changelog_wake_resume(this, fd->inode, stub); - return 0; + return 0; +out: + AFR_STACK_UNWIND(flush, frame, -1, op_errno, NULL); + return 0; } - -int32_t -afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync, dict_t *xdata) +int +afr_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int ret = -1; - int i = 0; - int32_t call_count = 0; - int32_t op_errno = 0; + afr_local_t *local = NULL; + int call_count = -1; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + local = frame->local; - priv = this->private; + LOCK(&frame->lock); + { + if (op_ret == 0) { + local->op_ret = 0; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref(xdata); + } else { + local->op_errno = op_errno; + } + call_count = --local->call_count; + } + UNLOCK(&frame->lock); - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + if (call_count == 0) + AFR_STACK_UNWIND(fsyncdir, frame, local->op_ret, local->op_errno, + local->xdata_rsp); - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + return 0; +} - call_count = local->call_count; +int +afr_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int32_t call_count = 0; + int32_t op_errno = ENOMEM; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_fsyncdir_cbk, - priv->children[i], - priv->children[i]->fops->fsyncdir, - fd, datasync, xdata); - if (!--call_count) - break; - } + priv = this->private; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->op = GF_FOP_FSYNCDIR; + if (!afr_is_consistent_io_possible(local, priv, &op_errno)) + goto out; + + call_count = local->call_count; + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND(frame, afr_fsyncdir_cbk, priv->children[i], + priv->children[i]->fops->fsyncdir, fd, datasync, xdata); + if (!--call_count) + break; } + } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); - return 0; + AFR_STACK_UNWIND(fsyncdir, frame, -1, op_errno, NULL); + + return 0; } /* }}} */ -/* {{{ xattrop */ +static int +afr_serialized_lock_wind(call_frame_t *frame, xlator_t *this); -int32_t -afr_xattrop_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr, dict_t *xdata) +static gf_boolean_t +afr_is_conflicting_lock_present(int32_t op_ret, int32_t op_errno) { - afr_local_t *local = NULL; - int call_count = -1; - - local = frame->local; + if (op_ret == -1 && op_errno == EAGAIN) + return _gf_true; + return _gf_false; +} - LOCK (&frame->lock); - { - if (op_ret == 0) { - if (!local->cont.xattrop.xattr) - local->cont.xattrop.xattr = dict_ref (xattr); - local->op_ret = 0; - } +static void +afr_fop_lock_unwind(call_frame_t *frame, glusterfs_fop_t op, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + switch (op) { + case GF_FOP_INODELK: + AFR_STACK_UNWIND(inodelk, frame, op_ret, op_errno, xdata); + break; + case GF_FOP_FINODELK: + AFR_STACK_UNWIND(finodelk, frame, op_ret, op_errno, xdata); + break; + case GF_FOP_ENTRYLK: + AFR_STACK_UNWIND(entrylk, frame, op_ret, op_errno, xdata); + break; + case GF_FOP_FENTRYLK: + AFR_STACK_UNWIND(fentrylk, frame, op_ret, op_errno, xdata); + break; + default: + break; + } +} - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); +static void +afr_fop_lock_wind(call_frame_t *frame, xlator_t *this, int child_index, + int32_t (*lock_cbk)(call_frame_t *, void *, xlator_t *, + int32_t, int32_t, dict_t *)) +{ + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + int i = child_index; + + switch (local->op) { + case GF_FOP_INODELK: + STACK_WIND_COOKIE( + frame, lock_cbk, (void *)(long)i, priv->children[i], + priv->children[i]->fops->inodelk, + (const char *)local->cont.inodelk.volume, &local->loc, + local->cont.inodelk.cmd, &local->cont.inodelk.flock, + local->cont.inodelk.xdata); + break; + case GF_FOP_FINODELK: + STACK_WIND_COOKIE( + frame, lock_cbk, (void *)(long)i, priv->children[i], + priv->children[i]->fops->finodelk, + (const char *)local->cont.inodelk.volume, local->fd, + local->cont.inodelk.cmd, &local->cont.inodelk.flock, + local->cont.inodelk.xdata); + break; + case GF_FOP_ENTRYLK: + STACK_WIND_COOKIE( + frame, lock_cbk, (void *)(long)i, priv->children[i], + priv->children[i]->fops->entrylk, local->cont.entrylk.volume, + &local->loc, local->cont.entrylk.basename, + local->cont.entrylk.cmd, local->cont.entrylk.type, + local->cont.entrylk.xdata); + break; + case GF_FOP_FENTRYLK: + STACK_WIND_COOKIE( + frame, lock_cbk, (void *)(long)i, priv->children[i], + priv->children[i]->fops->fentrylk, local->cont.entrylk.volume, + local->fd, local->cont.entrylk.basename, + local->cont.entrylk.cmd, local->cont.entrylk.type, + local->cont.entrylk.xdata); + break; + default: + break; + } +} - call_count = afr_frame_return (frame); +void +afr_fop_lock_proceed(call_frame_t *frame) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - if (call_count == 0) - AFR_STACK_UNWIND (xattrop, frame, local->op_ret, local->op_errno, - local->cont.xattrop.xattr, xdata); + local = frame->local; + priv = frame->this->private; - return 0; + if (local->fop_lock_state != AFR_FOP_LOCK_PARALLEL) { + afr_fop_lock_unwind(frame, local->op, local->op_ret, local->op_errno, + local->xdata_rsp); + return; + } + /* At least one child is up */ + /* + * Non-blocking locks also need to be serialized. Otherwise there is + * a chance that both the mounts which issued same non-blocking inodelk + * may endup not acquiring the lock on any-brick. + * Ex: Mount1 and Mount2 + * request for full length lock on file f1. Mount1 afr may acquire the + * partial lock on brick-1 and may not acquire the lock on brick-2 + * because Mount2 already got the lock on brick-2, vice versa. Since + * both the mounts only got partial locks, afr treats them as failure in + * gaining the locks and unwinds with EAGAIN errno. + */ + local->op_ret = -1; + local->op_errno = EUCLEAN; + local->fop_lock_state = AFR_FOP_LOCK_SERIAL; + afr_local_replies_wipe(local, priv); + if (local->xdata_rsp) + dict_unref(local->xdata_rsp); + local->xdata_rsp = NULL; + switch (local->op) { + case GF_FOP_INODELK: + case GF_FOP_FINODELK: + local->cont.inodelk.cmd = local->cont.inodelk.in_cmd; + local->cont.inodelk.flock = local->cont.inodelk.in_flock; + if (local->cont.inodelk.xdata) + dict_unref(local->cont.inodelk.xdata); + local->cont.inodelk.xdata = NULL; + if (local->xdata_req) + local->cont.inodelk.xdata = dict_ref(local->xdata_req); + break; + case GF_FOP_ENTRYLK: + case GF_FOP_FENTRYLK: + local->cont.entrylk.cmd = local->cont.entrylk.in_cmd; + if (local->cont.entrylk.xdata) + dict_unref(local->cont.entrylk.xdata); + local->cont.entrylk.xdata = NULL; + if (local->xdata_req) + local->cont.entrylk.xdata = dict_ref(local->xdata_req); + break; + default: + break; + } + afr_serialized_lock_wind(frame, frame->this); } +static int32_t +afr_unlock_partial_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) -int32_t -afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int ret = -1; - int i = 0; - int32_t call_count = 0; - int32_t op_errno = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int child_index = (long)cookie; + uuid_t gfid = {0}; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + local = frame->local; + priv = this->private; - priv = this->private; + if (op_ret < 0 && op_errno != ENOTCONN) { + if (local->fd) + gf_uuid_copy(gfid, local->fd->inode->gfid); + else + loc_gfid(&local->loc, gfid); + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL, + "%s: Failed to unlock %s on %s " + "with lk_owner: %s", + uuid_utoa(gfid), gf_fop_list[local->op], + priv->children[child_index]->name, + lkowner_utoa(&frame->root->lk_owner)); + } - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + call_count = afr_frame_return(frame); + if (call_count == 0) + afr_fop_lock_proceed(frame); - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + return 0; +} - call_count = local->call_count; +static int32_t +afr_unlock_locks_and_proceed(call_frame_t *frame, xlator_t *this, + int call_count) +{ + int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + if (call_count == 0) { + afr_fop_lock_proceed(frame); + goto out; + } + + local = frame->local; + priv = this->private; + local->call_count = call_count; + switch (local->op) { + case GF_FOP_INODELK: + case GF_FOP_FINODELK: + local->cont.inodelk.flock.l_type = F_UNLCK; + local->cont.inodelk.cmd = F_SETLK; + if (local->cont.inodelk.xdata) + dict_unref(local->cont.inodelk.xdata); + local->cont.inodelk.xdata = NULL; + break; + case GF_FOP_ENTRYLK: + case GF_FOP_FENTRYLK: + local->cont.entrylk.cmd = ENTRYLK_UNLOCK; + if (local->cont.entrylk.xdata) + dict_unref(local->cont.entrylk.xdata); + local->cont.entrylk.xdata = NULL; + break; + default: + break; + } - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_xattrop_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - loc, optype, xattr, xdata); - if (!--call_count) - break; - } - } + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; - ret = 0; -out: - if (ret < 0) - AFR_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL); - return 0; -} + if (local->replies[i].op_ret == -1) + continue; -/* }}} */ + afr_fop_lock_wind(frame, this, i, afr_unlock_partial_lock_cbk); -/* {{{ fxattrop */ + if (!--call_count) + break; + } + +out: + return 0; +} int32_t -afr_fxattrop_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr, dict_t *xdata) +afr_fop_lock_done(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; + int i = 0; + int lock_count = 0; + unsigned char *success = NULL; - int call_count = -1; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; + local = frame->local; + priv = this->private; + success = alloca0(priv->child_count); - LOCK (&frame->lock); - { - if (op_ret == 0) { - if (!local->cont.fxattrop.xattr) - local->cont.fxattrop.xattr = dict_ref (xattr); + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; - local->op_ret = 0; - } + if (local->replies[i].op_ret == 0) { + lock_count++; + success[i] = 1; + } + + if (local->op_ret == -1 && local->op_errno == EAGAIN) + continue; - local->op_errno = op_errno; + if ((local->replies[i].op_ret == -1) && + (local->replies[i].op_errno == EAGAIN)) { + local->op_ret = -1; + local->op_errno = EAGAIN; + continue; } - UNLOCK (&frame->lock); - call_count = afr_frame_return (frame); + if (local->replies[i].op_ret == 0) + local->op_ret = 0; - if (call_count == 0) - AFR_STACK_UNWIND (fxattrop, frame, local->op_ret, local->op_errno, - local->cont.fxattrop.xattr, xdata); + local->op_errno = local->replies[i].op_errno; + } - return 0; + if (afr_fop_lock_is_unlock(frame)) + goto unwind; + + if (afr_is_conflicting_lock_present(local->op_ret, local->op_errno)) { + afr_unlock_locks_and_proceed(frame, this, lock_count); + } else if (priv->quorum_count && !afr_has_quorum(success, this, NULL)) { + local->fop_lock_state = AFR_FOP_LOCK_QUORUM_FAILED; + local->op_ret = -1; + local->op_errno = afr_final_errno(local, priv); + if (local->op_errno == 0) + local->op_errno = afr_quorum_errno(priv); + afr_unlock_locks_and_proceed(frame, this, lock_count); + } else { + goto unwind; + } + + return 0; +unwind: + afr_fop_lock_unwind(frame, local->op, local->op_ret, local->op_errno, + local->xdata_rsp); + return 0; } +static int +afr_common_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + afr_local_t *local = NULL; + int child_index = (long)cookie; + + local = frame->local; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (op_ret == 0 && xdata) { + local->replies[child_index].xdata = dict_ref(xdata); + LOCK(&frame->lock); + { + if (!local->xdata_rsp) + local->xdata_rsp = dict_ref(xdata); + } + UNLOCK(&frame->lock); + } + return 0; +} + +static int32_t +afr_serialized_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) -int32_t -afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, - gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int ret = -1; - int i = 0; - int32_t call_count = 0; - int32_t op_errno = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int child_index = (long)cookie; + int next_child = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + local = frame->local; + priv = this->private; - priv = this->private; + afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata); - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + for (next_child = child_index + 1; next_child < priv->child_count; + next_child++) { + if (local->child_up[next_child]) + break; + } - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + if (afr_is_conflicting_lock_present(op_ret, op_errno) || + (next_child == priv->child_count)) { + afr_fop_lock_done(frame, this); + } else { + afr_fop_lock_wind(frame, this, next_child, afr_serialized_lock_cbk); + } - call_count = local->call_count; + return 0; +} - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_fxattrop_cbk, - priv->children[i], - priv->children[i]->fops->fxattrop, - fd, optype, xattr, xdata); - if (!--call_count) - break; - } +static int +afr_serialized_lock_wind(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + + priv = this->private; + local = frame->local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + afr_fop_lock_wind(frame, this, i, afr_serialized_lock_cbk); + break; } + } + return 0; +} - ret = 0; -out: - if (ret < 0) - AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); - return 0; +static int32_t +afr_parallel_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) + +{ + int call_count = 0; + + afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata); + + call_count = afr_frame_return(frame); + if (call_count == 0) + afr_fop_lock_done(frame, this); + + return 0; } -/* }}} */ +static int +afr_parallel_lock_wind(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int call_count = 0; + int i = 0; + priv = this->private; + local = frame->local; + call_count = local->call_count; -int32_t -afr_inodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; + afr_fop_lock_wind(frame, this, i, afr_parallel_lock_cbk); + if (!--call_count) + break; + } + return 0; +} +static int +afr_fop_handle_lock(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - int call_count = -1; + afr_local_t *local = frame->local; + int op_errno = 0; - local = frame->local; + if (!afr_fop_lock_is_unlock(frame)) { + if (!afr_is_consistent_io_possible(local, this->private, &op_errno)) + goto out; - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; + switch (local->op) { + case GF_FOP_INODELK: + case GF_FOP_FINODELK: + local->cont.inodelk.cmd = F_SETLK; + break; + case GF_FOP_ENTRYLK: + case GF_FOP_FENTRYLK: + local->cont.entrylk.cmd = ENTRYLK_LOCK_NB; + break; + default: + break; + } + } - local->op_errno = op_errno; + if (local->xdata_req) { + switch (local->op) { + case GF_FOP_INODELK: + case GF_FOP_FINODELK: + local->cont.inodelk.xdata = dict_ref(local->xdata_req); + break; + case GF_FOP_ENTRYLK: + case GF_FOP_FENTRYLK: + local->cont.entrylk.xdata = dict_ref(local->xdata_req); + break; + default: + break; } - UNLOCK (&frame->lock); + } - call_count = afr_frame_return (frame); + local->fop_lock_state = AFR_FOP_LOCK_PARALLEL; + afr_parallel_lock_wind(frame, this); +out: + return -op_errno; +} - if (call_count == 0) - AFR_STACK_UNWIND (inodelk, frame, local->op_ret, - local->op_errno, xdata); +static int32_t +afr_handle_inodelk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, + const char *volume, loc_t *loc, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t op_errno = ENOMEM; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->op = fop; + if (loc) + loc_copy(&local->loc, loc); + if (fd && (flock->l_type != F_UNLCK)) { + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + local->fd = fd_ref(fd); + } + + local->cont.inodelk.volume = gf_strdup(volume); + if (!local->cont.inodelk.volume) { + op_errno = ENOMEM; + goto out; + } + + local->cont.inodelk.in_cmd = cmd; + local->cont.inodelk.cmd = cmd; + local->cont.inodelk.in_flock = *flock; + local->cont.inodelk.flock = *flock; + if (xdata) + local->xdata_req = dict_ref(xdata); + + op_errno = -afr_fop_handle_lock(frame, frame->this); + if (op_errno) + goto out; + return 0; +out: + afr_fop_lock_unwind(frame, fop, -1, op_errno, NULL); - return 0; + return 0; } +int32_t +afr_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + int32_t cmd, struct gf_flock *flock, dict_t *xdata) +{ + afr_handle_inodelk(frame, this, GF_FOP_INODELK, volume, loc, NULL, cmd, + flock, xdata); + return 0; +} int32_t -afr_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, - struct gf_flock *flock, dict_t *xdata) +afr_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int32_t cmd, struct gf_flock *flock, dict_t *xdata) +{ + afr_handle_inodelk(frame, this, GF_FOP_FINODELK, volume, NULL, fd, cmd, + flock, xdata); + return 0; +} + +static int +afr_handle_entrylk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, + const char *volume, loc_t *loc, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int ret = -1; - int i = 0; - int32_t call_count = 0; - int32_t op_errno = 0; + afr_local_t *local = NULL; + int32_t op_errno = ENOMEM; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->op = fop; + if (loc) + loc_copy(&local->loc, loc); + if (fd && (cmd != ENTRYLK_UNLOCK)) { + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + local->fd = fd_ref(fd); + } + local->cont.entrylk.cmd = cmd; + local->cont.entrylk.in_cmd = cmd; + local->cont.entrylk.type = type; + local->cont.entrylk.volume = gf_strdup(volume); + local->cont.entrylk.basename = gf_strdup(basename); + if (!local->cont.entrylk.volume || !local->cont.entrylk.basename) { + op_errno = ENOMEM; + goto out; + } + if (xdata) + local->xdata_req = dict_ref(xdata); + op_errno = -afr_fop_handle_lock(frame, frame->this); + if (op_errno) + goto out; + + return 0; +out: + afr_fop_lock_unwind(frame, fop, -1, op_errno, NULL); + return 0; +} - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); +int +afr_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) +{ + afr_handle_entrylk(frame, this, GF_FOP_ENTRYLK, volume, loc, NULL, basename, + cmd, type, xdata); + return 0; +} - priv = this->private; +int +afr_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) +{ + afr_handle_entrylk(frame, this, GF_FOP_FENTRYLK, volume, NULL, fd, basename, + cmd, type, xdata); + return 0; +} - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; +int +afr_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct statvfs *statvfs, dict_t *xdata) +{ + afr_local_t *local = NULL; + int call_count = 0; + struct statvfs *buf = NULL; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = frame->local; - call_count = local->call_count; + LOCK(&frame->lock); + { + if (op_ret != 0) { + local->op_errno = op_errno; + goto unlock; + } - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_inodelk_cbk, - priv->children[i], - priv->children[i]->fops->inodelk, - volume, loc, cmd, flock, xdata); - - if (!--call_count) - break; + local->op_ret = op_ret; + + buf = &local->cont.statfs.buf; + if (local->cont.statfs.buf_set) { + if (statvfs->f_bavail < buf->f_bavail) { + *buf = *statvfs; + if (xdata) { + if (local->xdata_rsp) + dict_unref(local->xdata_rsp); + local->xdata_rsp = dict_ref(xdata); } + } + } else { + *buf = *statvfs; + local->cont.statfs.buf_set = 1; + if (xdata) + local->xdata_rsp = dict_ref(xdata); } + } +unlock: + call_count = --local->call_count; + UNLOCK(&frame->lock); - ret = 0; -out: - if (ret < 0) - AFR_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL); - return 0; + if (call_count == 0) + AFR_STACK_UNWIND(statfs, frame, local->op_ret, local->op_errno, + &local->cont.statfs.buf, local->xdata_rsp); + + return 0; } +int +afr_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; + int32_t op_errno = ENOMEM; + + priv = this->private; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->op = GF_FOP_STATFS; + if (!afr_is_consistent_io_possible(local, priv, &op_errno)) + goto out; + + if (priv->arbiter_count == 1 && local->child_up[ARBITER_BRICK_INDEX]) + local->call_count--; + call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + if (AFR_IS_ARBITER_BRICK(priv, i)) + continue; + STACK_WIND(frame, afr_statfs_cbk, priv->children[i], + priv->children[i]->fops->statfs, loc, xdata); + if (!--call_count) + break; + } + } + + return 0; +out: + AFR_STACK_UNWIND(statfs, frame, -1, op_errno, NULL, NULL); + + return 0; +} int32_t -afr_finodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, +afr_lk_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata) - { - afr_local_t *local = NULL; - int call_count = -1; - - local = frame->local; + afr_local_t *local = NULL; + afr_private_t *priv = this->private; + int call_count = -1; + int child_index = (long)cookie; - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + local = frame->local; - call_count = afr_frame_return (frame); + if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL, + "gfid=%s: unlock failed on subvolume %s " + "with lock owner %s", + uuid_utoa(local->fd->inode->gfid), + priv->children[child_index]->name, + lkowner_utoa(&frame->root->lk_owner)); + } - if (call_count == 0) - AFR_STACK_UNWIND (finodelk, frame, local->op_ret, - local->op_errno, xdata); + call_count = afr_frame_return(frame); + if (call_count == 0) { + AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, NULL, + local->xdata_rsp); + } - return 0; + return 0; } - int32_t -afr_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, - dict_t *xdata) +afr_lk_unlock(call_frame_t *frame, xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int ret = -1; - int i = 0; - int32_t call_count = 0; - int32_t op_errno = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + local = frame->local; + priv = this->private; - priv = this->private; + call_count = afr_locked_nodes_count(local->cont.lk.locked_nodes, + priv->child_count); - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + if (call_count == 0) { + AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, NULL, + local->xdata_rsp); + return 0; + } - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local->call_count = call_count; - call_count = local->call_count; + local->cont.lk.user_flock.l_type = F_UNLCK; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_finodelk_cbk, - priv->children[i], - priv->children[i]->fops->finodelk, - volume, fd, cmd, flock, xdata); - - if (!--call_count) - break; - } + for (i = 0; i < priv->child_count; i++) { + if (local->cont.lk.locked_nodes[i]) { + STACK_WIND_COOKIE(frame, afr_lk_unlock_cbk, (void *)(long)i, + priv->children[i], priv->children[i]->fops->lk, + local->fd, F_SETLK, &local->cont.lk.user_flock, + NULL); + + if (!--call_count) + break; } + } - ret = 0; -out: - if (ret < 0) - AFR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL); - return 0; + return 0; } - int32_t -afr_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +afr_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct gf_flock *lock, dict_t *xdata) { - afr_local_t *local = NULL; - int call_count = -1; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int child_index = -1; - local = frame->local; + local = frame->local; + priv = this->private; - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; + child_index = (long)cookie; - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata); + if (op_ret < 0 && op_errno == EAGAIN) { + local->op_ret = -1; + local->op_errno = EAGAIN; - call_count = afr_frame_return (frame); + afr_lk_unlock(frame, this); + return 0; + } + + if (op_ret == 0) { + local->op_ret = 0; + local->op_errno = 0; + local->cont.lk.locked_nodes[child_index] = 1; + local->cont.lk.ret_flock = *lock; + } + + child_index++; + + if (child_index < priv->child_count) { + STACK_WIND_COOKIE(frame, afr_lk_cbk, (void *)(long)child_index, + priv->children[child_index], + priv->children[child_index]->fops->lk, local->fd, + local->cont.lk.cmd, &local->cont.lk.user_flock, + local->xdata_req); + } else if (priv->quorum_count && + !afr_has_quorum(local->cont.lk.locked_nodes, this, NULL)) { + local->op_ret = -1; + local->op_errno = afr_final_errno(local, priv); - if (call_count == 0) - AFR_STACK_UNWIND (entrylk, frame, local->op_ret, - local->op_errno, xdata); + afr_lk_unlock(frame, this); + } else { + if (local->op_ret < 0) + local->op_errno = afr_final_errno(local, priv); - return 0; -} + AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, + &local->cont.lk.ret_flock, local->xdata_rsp); + } + return 0; +} -int32_t -afr_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, - const char *basename, entrylk_cmd cmd, entrylk_type type, - dict_t *xdata) +int +afr_lk_transaction_cbk(int ret, call_frame_t *frame, void *opaque) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int ret = -1; - int i = 0; - int32_t call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + return 0; +} - priv = this->private; +int +afr_lk_txn_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) +{ + afr_local_t *local = NULL; + int child_index = -1; + + local = frame->local; + child_index = (long)cookie; + afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata); + if (op_ret == 0) { + local->op_ret = 0; + local->op_errno = 0; + local->cont.lk.locked_nodes[child_index] = 1; + local->cont.lk.ret_flock = *lock; + } + syncbarrier_wake(&local->barrier); + return 0; +} - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; +int +afr_lk_txn_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) +{ + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + int child_index = (long)cookie; + + if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL, + "gfid=%s: unlock failed on subvolume %s " + "with lock owner %s", + uuid_utoa(local->fd->inode->gfid), + priv->children[child_index]->name, + lkowner_utoa(&frame->root->lk_owner)); + } + return 0; +} +int +afr_lk_transaction(void *opaque) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + char *wind_on = NULL; + int op_errno = 0; + int i = 0; + int ret = 0; + + frame = (call_frame_t *)opaque; + local = frame->local; + this = frame->this; + priv = this->private; + wind_on = alloca0(priv->child_count); + + if (priv->arbiter_count || priv->child_count != 3) { + op_errno = ENOTSUP; + gf_msg(frame->this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "%s: Lock healing supported only for replica 3 volumes.", + uuid_utoa(local->fd->inode->gfid)); + goto err; + } + + op_errno = -afr_dom_lock_acquire(frame); // Released during + // AFR_STACK_UNWIND + if (op_errno != 0) { + goto err; + } + if (priv->quorum_count && + !afr_has_quorum(local->cont.lk.dom_locked_nodes, this, NULL)) { + op_errno = afr_final_errno(local, priv); + goto err; + } + + for (i = 0; i < priv->child_count; i++) { + if (priv->child_up[i] && local->cont.lk.dom_locked_nodes[i]) + wind_on[i] = 1; + } + AFR_ONLIST(wind_on, frame, afr_lk_txn_wind_cbk, lk, local->fd, + local->cont.lk.cmd, &local->cont.lk.user_flock, + local->xdata_req); + + if (priv->quorum_count && + !afr_has_quorum(local->cont.lk.locked_nodes, this, NULL)) { + local->op_ret = -1; + local->op_errno = afr_final_errno(local, priv); + goto unlock; + } else { + if (local->cont.lk.user_flock.l_type == F_UNLCK) + ret = afr_remove_lock_from_saved_locks(local, this); + else + ret = afr_add_lock_to_saved_locks(frame, this); + if (ret) { + local->op_ret = -1; + local->op_errno = -ret; + goto unlock; + } + AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, + &local->cont.lk.ret_flock, local->xdata_rsp); + } - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + return 0; - call_count = local->call_count; +unlock: + local->cont.lk.user_flock.l_type = F_UNLCK; + AFR_ONLIST(local->cont.lk.locked_nodes, frame, afr_lk_txn_unlock_cbk, lk, + local->fd, F_SETLK, &local->cont.lk.user_flock, NULL); +err: + AFR_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL); + return -1; +} - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_entrylk_cbk, - priv->children[i], - priv->children[i]->fops->entrylk, - volume, loc, basename, cmd, type, xdata); - - if (!--call_count) - break; - } +int +afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int ret = 0; + int i = 0; + int32_t op_errno = ENOMEM; + + priv = this->private; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->op = GF_FOP_LK; + if (!afr_lk_is_unlock(cmd, flock)) { + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + if (!afr_is_consistent_io_possible(local, priv, &op_errno)) + goto out; + } + + local->cont.lk.locked_nodes = GF_CALLOC( + priv->child_count, sizeof(*local->cont.lk.locked_nodes), + gf_afr_mt_char); + + if (!local->cont.lk.locked_nodes) { + op_errno = ENOMEM; + goto out; + } + + local->fd = fd_ref(fd); + local->cont.lk.cmd = cmd; + local->cont.lk.user_flock = *flock; + local->cont.lk.ret_flock = *flock; + if (xdata) + local->xdata_req = dict_ref(xdata); + + if (afr_is_lock_mode_mandatory(xdata)) { + ret = synctask_new(this->ctx->env, afr_lk_transaction, + afr_lk_transaction_cbk, frame, frame); + if (ret) { + op_errno = ENOMEM; + goto out; } - - ret = 0; -out: - if (ret < 0) - AFR_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL); return 0; -} + } + STACK_WIND_COOKIE(frame, afr_lk_cbk, (void *)(long)0, priv->children[i], + priv->children[i]->fops->lk, fd, cmd, flock, + local->xdata_req); + return 0; +out: + AFR_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL); -int32_t -afr_fentrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) + return 0; +} +int32_t +afr_lease_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_lease *lease, + dict_t *xdata) { - afr_local_t *local = NULL; - int call_count = -1; - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + afr_local_t *local = NULL; + int call_count = -1; - call_count = afr_frame_return (frame); + local = frame->local; + call_count = afr_frame_return(frame); - if (call_count == 0) - AFR_STACK_UNWIND (fentrylk, frame, local->op_ret, - local->op_errno, xdata); + if (call_count == 0) + AFR_STACK_UNWIND(lease, frame, local->op_ret, local->op_errno, lease, + xdata); - return 0; + return 0; } - int32_t -afr_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, - const char *basename, entrylk_cmd cmd, - entrylk_type type, dict_t *xdata) +afr_lease_unlock(call_frame_t *frame, xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int ret = -1; - int i = 0; - int32_t call_count = 0; - int32_t op_errno = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + local = frame->local; + priv = this->private; - priv = this->private; + call_count = afr_locked_nodes_count(local->cont.lease.locked_nodes, + priv->child_count); - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + if (call_count == 0) { + AFR_STACK_UNWIND(lease, frame, local->op_ret, local->op_errno, + &local->cont.lease.ret_lease, NULL); + return 0; + } - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local->call_count = call_count; - call_count = local->call_count; + local->cont.lease.user_lease.cmd = GF_UNLK_LEASE; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_fentrylk_cbk, - priv->children[i], - priv->children[i]->fops->fentrylk, - volume, fd, basename, cmd, type, xdata); - - if (!--call_count) - break; - } + for (i = 0; i < priv->child_count; i++) { + if (local->cont.lease.locked_nodes[i]) { + STACK_WIND(frame, afr_lease_unlock_cbk, priv->children[i], + priv->children[i]->fops->lease, &local->loc, + &local->cont.lease.user_lease, NULL); + + if (!--call_count) + break; } + } - ret = 0; -out: - if (ret < 0) - AFR_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL); - return 0; + return 0; } int32_t -afr_statfs_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct statvfs *statvfs, dict_t *xdata) +afr_lease_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct gf_lease *lease, dict_t *xdata) { - afr_local_t *local = NULL; - int call_count = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int child_index = -1; - LOCK (&frame->lock); - { - local = frame->local; - - if (op_ret == 0) { - local->op_ret = op_ret; - - if (local->cont.statfs.buf_set) { - if (statvfs->f_bavail < local->cont.statfs.buf.f_bavail) - local->cont.statfs.buf = *statvfs; - } else { - local->cont.statfs.buf = *statvfs; - local->cont.statfs.buf_set = 1; - } - } + local = frame->local; + priv = this->private; - if (op_ret == -1) - local->op_errno = op_errno; + child_index = (long)cookie; - } - UNLOCK (&frame->lock); + afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata); + if (op_ret < 0 && op_errno == EAGAIN) { + local->op_ret = -1; + local->op_errno = EAGAIN; - call_count = afr_frame_return (frame); + afr_lease_unlock(frame, this); + return 0; + } + + if (op_ret == 0) { + local->op_ret = 0; + local->op_errno = 0; + local->cont.lease.locked_nodes[child_index] = 1; + local->cont.lease.ret_lease = *lease; + } + + child_index++; + if (child_index < priv->child_count) { + STACK_WIND_COOKIE(frame, afr_lease_cbk, (void *)(long)child_index, + priv->children[child_index], + priv->children[child_index]->fops->lease, &local->loc, + &local->cont.lease.user_lease, xdata); + } else if (priv->quorum_count && + !afr_has_quorum(local->cont.lease.locked_nodes, this, NULL)) { + local->op_ret = -1; + local->op_errno = afr_final_errno(local, priv); - if (call_count == 0) - AFR_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno, - &local->cont.statfs.buf, xdata); + afr_lease_unlock(frame, this); + } else { + if (local->op_ret < 0) + local->op_errno = afr_final_errno(local, priv); + AFR_STACK_UNWIND(lease, frame, local->op_ret, local->op_errno, + &local->cont.lease.ret_lease, NULL); + } - return 0; + return 0; } - -int32_t -afr_statfs (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xdata) +int +afr_lease(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct gf_lease *lease, dict_t *xdata) { - afr_private_t * priv = NULL; - int child_count = 0; - afr_local_t * local = NULL; - int i = 0; - int ret = -1; - int call_count = 0; - int32_t op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int32_t op_errno = ENOMEM; - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); + priv = this->private; - priv = this->private; - child_count = priv->child_count; + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + local->op = GF_FOP_LEASE; + local->cont.lease.locked_nodes = GF_CALLOC( + priv->child_count, sizeof(*local->cont.lease.locked_nodes), + gf_afr_mt_char); - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + if (!local->cont.lease.locked_nodes) { + op_errno = ENOMEM; + goto out; + } - call_count = local->call_count; + loc_copy(&local->loc, loc); + local->cont.lease.user_lease = *lease; + local->cont.lease.ret_lease = *lease; - for (i = 0; i < child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_statfs_cbk, - priv->children[i], - priv->children[i]->fops->statfs, - loc, xdata); - if (!--call_count) - break; - } - } + STACK_WIND_COOKIE(frame, afr_lease_cbk, (void *)(long)0, priv->children[0], + priv->children[0]->fops->lease, loc, lease, xdata); - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); - return 0; -} - + AFR_STACK_UNWIND(lease, frame, -1, op_errno, NULL, NULL); -int32_t -afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock, - dict_t *xdata) -{ - afr_local_t * local = NULL; - int call_count = -1; + return 0; +} - local = frame->local; - call_count = afr_frame_return (frame); +int +afr_ipc_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + afr_local_t *local = NULL; + int child_index = (long)cookie; + int call_count = 0; + gf_boolean_t failed = _gf_false; + gf_boolean_t succeeded = _gf_false; + int i = 0; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (xdata) + local->replies[child_index].xdata = dict_ref(xdata); + + call_count = afr_frame_return(frame); + if (call_count) + goto out; + /* If any of the subvolumes failed with other than ENOTCONN + * return error else return success unless all the subvolumes + * failed. + * TODO: In case of failure, we need to unregister the xattrs + * from the other subvolumes where it succeeded (once upcall + * fixes the Bz-1371622)*/ + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret < 0 && + local->replies[i].op_errno != ENOTCONN) { + local->op_ret = local->replies[i].op_ret; + local->op_errno = local->replies[i].op_errno; + if (local->xdata_rsp) + dict_unref(local->xdata_rsp); + local->xdata_rsp = NULL; + if (local->replies[i].xdata) { + local->xdata_rsp = dict_ref(local->replies[i].xdata); + } + failed = _gf_true; + break; + } + if (local->replies[i].op_ret == 0) { + succeeded = _gf_true; + local->op_ret = 0; + local->op_errno = 0; + if (!local->xdata_rsp && local->replies[i].xdata) { + local->xdata_rsp = dict_ref(local->replies[i].xdata); + } + } + } + + if (!succeeded && !failed) { + local->op_ret = -1; + local->op_errno = ENOTCONN; + } - if (call_count == 0) - AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, - lock, xdata); + AFR_STACK_UNWIND(ipc, frame, local->op_ret, local->op_errno, + local->xdata_rsp); - return 0; +out: + return 0; } - -int32_t -afr_lk_unlock (call_frame_t *frame, xlator_t *this) +int +afr_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int i = 0; - int call_count = 0; + afr_local_t *local = NULL; + int32_t op_errno = -1; + afr_private_t *priv = NULL; + int i = 0; + int call_cnt = -1; - local = frame->local; - priv = this->private; + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); - call_count = afr_locked_nodes_count (local->cont.lk.locked_nodes, - priv->child_count); + if (op != GF_IPC_TARGET_UPCALL) + goto wind_default; - if (call_count == 0) { - AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, - &local->cont.lk.ret_flock, NULL); - return 0; - } + VALIDATE_OR_GOTO(this->private, err); + priv = this->private; - local->call_count = call_count; + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto err; - local->cont.lk.user_flock.l_type = F_UNLCK; + call_cnt = local->call_count; + if (xdata) { for (i = 0; i < priv->child_count; i++) { - if (local->cont.lk.locked_nodes[i]) { - STACK_WIND (frame, afr_lk_unlock_cbk, - priv->children[i], - priv->children[i]->fops->lk, - local->fd, F_SETLK, - &local->cont.lk.user_flock, NULL); - - if (!--call_count) - break; - } + if (dict_set_int8(xdata, priv->pending_key[i], 0) < 0) + goto err; } + } - return 0; -} + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; + STACK_WIND_COOKIE(frame, afr_ipc_cbk, (void *)(long)i, + priv->children[i], priv->children[i]->fops->ipc, op, + xdata); + if (!--call_cnt) + break; + } + return 0; -int32_t -afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata) +err: + if (op_errno == -1) + op_errno = errno; + AFR_STACK_UNWIND(ipc, frame, -1, op_errno, NULL); + + return 0; + +wind_default: + STACK_WIND(frame, default_ipc_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ipc, op, xdata); + return 0; +} + +int +afr_forget(xlator_t *this, inode_t *inode) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int child_index = -1; -/* int ret = 0; */ + uint64_t ctx_int = 0; + afr_inode_ctx_t *ctx = NULL; + afr_spb_choice_timeout_cancel(this, inode); + inode_ctx_del(inode, this, &ctx_int); + if (!ctx_int) + return 0; - local = frame->local; - priv = this->private; + ctx = (afr_inode_ctx_t *)(uintptr_t)ctx_int; + afr_inode_ctx_destroy(ctx); + return 0; +} - child_index = (long) cookie; +int +afr_priv_dump(xlator_t *this) +{ + afr_private_t *priv = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN]; + char key[GF_DUMP_MAX_BUF_LEN]; + int i = 0; + + GF_ASSERT(this); + priv = this->private; + + GF_ASSERT(priv); + snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); + gf_proc_dump_add_section("%s", key_prefix); + gf_proc_dump_write("child_count", "%u", priv->child_count); + for (i = 0; i < priv->child_count; i++) { + sprintf(key, "child_up[%d]", i); + gf_proc_dump_write(key, "%d", priv->child_up[i]); + sprintf(key, "pending_key[%d]", i); + gf_proc_dump_write(key, "%s", priv->pending_key[i]); + sprintf(key, "pending_reads[%d]", i); + gf_proc_dump_write(key, "%" PRId64, + GF_ATOMIC_GET(priv->pending_reads[i])); + sprintf(key, "child_latency[%d]", i); + gf_proc_dump_write(key, "%" PRId64, priv->child_latency[i]); + sprintf(key, "halo_child_up[%d]", i); + gf_proc_dump_write(key, "%d", priv->halo_child_up[i]); + } + gf_proc_dump_write("data_self_heal", "%d", priv->data_self_heal); + gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal); + gf_proc_dump_write("entry_self_heal", "%d", priv->entry_self_heal); + gf_proc_dump_write("read_child", "%d", priv->read_child); + gf_proc_dump_write("wait_count", "%u", priv->wait_count); + gf_proc_dump_write("heal-wait-queue-length", "%d", priv->heal_wait_qlen); + gf_proc_dump_write("heal-waiters", "%d", priv->heal_waiters); + gf_proc_dump_write("background-self-heal-count", "%d", + priv->background_self_heal_count); + gf_proc_dump_write("healers", "%d", priv->healers); + gf_proc_dump_write("read-hash-mode", "%d", priv->hash_mode); + gf_proc_dump_write("use-anonymous-inode", "%d", priv->use_anon_inode); + if (priv->quorum_count == AFR_QUORUM_AUTO) { + gf_proc_dump_write("quorum-type", "auto"); + } else if (priv->quorum_count == 0) { + gf_proc_dump_write("quorum-type", "none"); + } else { + gf_proc_dump_write("quorum-type", "fixed"); + gf_proc_dump_write("quorum-count", "%d", priv->quorum_count); + } + gf_proc_dump_write("up", "%u", afr_has_quorum(priv->child_up, this, NULL)); + if (priv->thin_arbiter_count) { + gf_proc_dump_write("ta_child_up", "%d", priv->ta_child_up); + gf_proc_dump_write("ta_bad_child_index", "%d", + priv->ta_bad_child_index); + gf_proc_dump_write("ta_notify_dom_lock_offset", "%" PRId64, + priv->ta_notify_dom_lock_offset); + } + + return 0; +} - if (!child_went_down (op_ret, op_errno) && (op_ret == -1)) { - local->op_ret = -1; - local->op_errno = op_errno; +/** + * find_child_index - find the child's index in the array of subvolumes + * @this: AFR + * @child: child + */ - afr_lk_unlock (frame, this); - return 0; - } +static int +afr_find_child_index(xlator_t *this, xlator_t *child) +{ + afr_private_t *priv = NULL; + int child_count = -1; + int i = -1; - if (op_ret == 0) { - local->op_ret = 0; - local->op_errno = 0; - local->cont.lk.locked_nodes[child_index] = 1; - local->cont.lk.ret_flock = *lock; - } + priv = this->private; + child_count = priv->child_count; + if (priv->thin_arbiter_count) { + child_count++; + } - child_index++; + for (i = 0; i < child_count; i++) { + if ((xlator_t *)child == priv->children[i]) + break; + } - if (child_index < priv->child_count) { - STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->lk, - local->fd, local->cont.lk.cmd, - &local->cont.lk.user_flock, xdata); - } else if (local->op_ret == -1) { - /* all nodes have gone down */ + return i; +} - AFR_STACK_UNWIND (lk, frame, -1, ENOTCONN, - &local->cont.lk.ret_flock, NULL); - } else { - /* locking has succeeded on all nodes that are up */ +int +__afr_get_up_children_count(afr_private_t *priv) +{ + int up_children = 0; + int i = 0; - /* temporarily - ret = afr_mark_locked_nodes (this, local->fd, - local->cont.lk.locked_nodes); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "Could not save locked nodes info in fdctx"); + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == 1) + up_children++; - ret = afr_save_locked_fd (this, local->fd); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "Could not save locked fd"); + return up_children; +} - */ - AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, - &local->cont.lk.ret_flock, NULL); - } +static int +__get_heard_from_all_status(xlator_t *this) +{ + afr_private_t *priv = this->private; + int i; + for (i = 0; i < priv->child_count; i++) { + if (!priv->last_event[i]) { + return 0; + } + } + if (priv->thin_arbiter_count && !priv->ta_child_up) { return 0; + } + return 1; } - -int -afr_lk (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) +glusterfs_event_t +__afr_transform_event_from_state(xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int i = 0; - int32_t op_errno = 0; - int ret = -1; + int i = 0; + int up_children = 0; + afr_private_t *priv = this->private; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + if (__get_heard_from_all_status(this)) + /* have_heard_from_all. Let afr_notify() do the propagation. */ + return GF_EVENT_MAXVAL; - priv = this->private; + up_children = __afr_get_up_children_count(priv); + /* Treat the children with pending notification, as having sent a + * GF_EVENT_CHILD_DOWN. i.e. set the event as GF_EVENT_SOME_DESCENDENT_DOWN, + * as done in afr_notify() */ + for (i = 0; i < priv->child_count; i++) { + if (priv->last_event[i]) + continue; + priv->last_event[i] = GF_EVENT_SOME_DESCENDENT_DOWN; + priv->child_up[i] = 0; + } - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + if (up_children) + /* We received at least one child up */ + return GF_EVENT_CHILD_UP; + else + return GF_EVENT_CHILD_DOWN; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + return GF_EVENT_MAXVAL; +} + +static void +afr_notify_cbk(void *data) +{ + xlator_t *this = data; + afr_private_t *priv = this->private; + glusterfs_event_t event = GF_EVENT_MAXVAL; + gf_boolean_t propagate = _gf_false; + + LOCK(&priv->lock); + { + if (!priv->timer) { + /* + * Either child_up/child_down is already sent to parent. + * This is a spurious wake up. + */ + goto unlock; + } + priv->timer = NULL; + event = __afr_transform_event_from_state(this); + if (event != GF_EVENT_MAXVAL) + propagate = _gf_true; + } +unlock: + UNLOCK(&priv->lock); + if (propagate) + default_notify(this, event, NULL); +} - local->cont.lk.locked_nodes = GF_CALLOC (priv->child_count, - sizeof (*local->cont.lk.locked_nodes), - gf_afr_mt_char); +static void +__afr_launch_notify_timer(xlator_t *this, afr_private_t *priv) +{ + struct timespec delay = { + 0, + }; - if (!local->cont.lk.locked_nodes) { - op_errno = ENOMEM; - goto out; - } + gf_msg_debug(this->name, 0, "Initiating child-down timer"); + delay.tv_sec = 10; + delay.tv_nsec = 0; + priv->timer = gf_timer_call_after(this->ctx, delay, afr_notify_cbk, this); + if (priv->timer == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_TIMER_CREATE_FAIL, + "Cannot create timer for delayed initialization"); + } +} - local->fd = fd_ref (fd); - local->cont.lk.cmd = cmd; - local->cont.lk.user_flock = *flock; - local->cont.lk.ret_flock = *flock; +static int +find_best_down_child(xlator_t *this) +{ + afr_private_t *priv = NULL; + int i = -1; + int32_t best_child = -1; + int64_t best_latency = INT64_MAX; - STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0, - priv->children[i], - priv->children[i]->fops->lk, - fd, cmd, flock, xdata); + priv = this->private; - ret = 0; -out: - if (ret < 0) - AFR_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); - return 0; + for (i = 0; i < priv->child_count; i++) { + if (!priv->child_up[i] && priv->child_latency[i] >= 0 && + priv->child_latency[i] < best_latency) { + best_child = i; + best_latency = priv->child_latency[i]; + } + } + if (best_child >= 0) { + gf_msg_debug(this->name, 0, + "Found best down child (%d) @ %" PRId64 " ms latency", + best_child, best_latency); + } + return best_child; } int -afr_forget (xlator_t *this, inode_t *inode) +find_worst_up_child(xlator_t *this) { - uint64_t ctx_addr = 0; - afr_inode_ctx_t *ctx = NULL; + afr_private_t *priv = NULL; + int i = -1; + int32_t worst_child = -1; + int64_t worst_latency = INT64_MIN; - inode_ctx_get (inode, this, &ctx_addr); + priv = this->private; - if (!ctx_addr) - goto out; + for (i = 0; i < priv->child_count; i++) { + if (priv->child_up[i] && priv->child_latency[i] >= 0 && + priv->child_latency[i] > worst_latency) { + worst_child = i; + worst_latency = priv->child_latency[i]; + } + } + if (worst_child >= 0) { + gf_msg_debug(this->name, 0, + "Found worst up child (%d) @ %" PRId64 " ms latency", + worst_child, worst_latency); + } + return worst_child; +} - ctx = (afr_inode_ctx_t *)(long)ctx_addr; - GF_FREE (ctx->fresh_children); - GF_FREE (ctx); -out: - return 0; +void +__afr_handle_ping_event(xlator_t *this, xlator_t *child_xlator, const int idx, + int64_t halo_max_latency_msec, int32_t *event, + int64_t child_latency_msec) +{ + afr_private_t *priv = NULL; + int up_children = 0; + + priv = this->private; + + priv->child_latency[idx] = child_latency_msec; + gf_msg_debug(child_xlator->name, 0, "Client ping @ %" PRId64 " ms", + child_latency_msec); + if (priv->shd.iamshd) + return; + + up_children = __afr_get_up_children_count(priv); + + if (child_latency_msec > halo_max_latency_msec && + priv->child_up[idx] == 1 && up_children > priv->halo_min_replicas) { + if ((up_children - 1) < priv->halo_min_replicas) { + gf_log(child_xlator->name, GF_LOG_INFO, + "Overriding halo threshold, " + "min replicas: %d", + priv->halo_min_replicas); + } else { + gf_log(child_xlator->name, GF_LOG_INFO, + "Child latency (%" PRId64 + " ms) " + "exceeds halo threshold (%" PRId64 + "), " + "marking child down.", + child_latency_msec, halo_max_latency_msec); + if (priv->halo_child_up[idx]) { + *event = GF_EVENT_CHILD_DOWN; + } + } + } else if (child_latency_msec < halo_max_latency_msec && + priv->child_up[idx] == 0) { + if (up_children < priv->halo_max_replicas) { + gf_log(child_xlator->name, GF_LOG_INFO, + "Child latency (%" PRId64 + " ms) " + "below halo threshold (%" PRId64 + "), " + "marking child up.", + child_latency_msec, halo_max_latency_msec); + if (priv->halo_child_up[idx]) { + *event = GF_EVENT_CHILD_UP; + } + } else { + gf_log(child_xlator->name, GF_LOG_INFO, + "Not marking child %d up, " + "max replicas (%d) reached.", + idx, priv->halo_max_replicas); + } + } } -int -afr_priv_dump (xlator_t *this) +static int64_t +afr_get_halo_latency(xlator_t *this) { - afr_private_t *priv = NULL; - char key_prefix[GF_DUMP_MAX_BUF_LEN]; - char key[GF_DUMP_MAX_BUF_LEN]; - int i = 0; + afr_private_t *priv = NULL; + int64_t halo_max_latency_msec = 0; + priv = this->private; - GF_ASSERT (this); - priv = this->private; + if (priv->shd.iamshd) { + halo_max_latency_msec = priv->shd.halo_max_latency_msec; + } else if (priv->nfsd.iamnfsd) { + halo_max_latency_msec = priv->nfsd.halo_max_latency_msec; + } else { + halo_max_latency_msec = priv->halo_max_latency_msec; + } + gf_msg_debug(this->name, 0, "Using halo latency %" PRId64, + halo_max_latency_msec); + return halo_max_latency_msec; +} - GF_ASSERT (priv); - snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); - gf_proc_dump_add_section(key_prefix); - gf_proc_dump_write("child_count", "%u", priv->child_count); - gf_proc_dump_write("read_child_rr", "%u", priv->read_child_rr); - for (i = 0; i < priv->child_count; i++) { - sprintf (key, "child_up[%d]", i); - gf_proc_dump_write(key, "%d", priv->child_up[i]); - sprintf (key, "pending_key[%d]", i); - gf_proc_dump_write(key, "%s", priv->pending_key[i]); - } - gf_proc_dump_write("data_self_heal", "%s", priv->data_self_heal); - gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal); - gf_proc_dump_write("entry_self_heal", "%d", priv->entry_self_heal); - gf_proc_dump_write("data_change_log", "%d", priv->data_change_log); - gf_proc_dump_write("metadata_change_log", "%d", priv->metadata_change_log); - gf_proc_dump_write("entry-change_log", "%d", priv->entry_change_log); - gf_proc_dump_write("read_child", "%d", priv->read_child); - gf_proc_dump_write("favorite_child", "%d", priv->favorite_child); - gf_proc_dump_write("wait_count", "%u", priv->wait_count); +void +__afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator, + const int idx, int64_t child_latency_msec, + int32_t *event, int32_t *call_psh, + int32_t *up_child) +{ + afr_private_t *priv = NULL; + int up_children = 0; + int worst_up_child = -1; + int64_t halo_max_latency_msec = afr_get_halo_latency(this); + + priv = this->private; + + /* + * This only really counts if the child was never up + * (value = -1) or had been down (value = 0). See + * comment at GF_EVENT_CHILD_DOWN for a more detailed + * explanation. + */ + if (priv->child_up[idx] != 1) { + priv->event_generation++; + } + priv->child_up[idx] = 1; + + *call_psh = 1; + *up_child = idx; + up_children = __afr_get_up_children_count(priv); + /* + * If this is an _actual_ CHILD_UP event, we + * want to set the child_latency to MAX to indicate + * the child needs ping data to be available before doing child-up + */ + if (!priv->halo_enabled) + goto out; + + if (child_latency_msec < 0) { + /*set to INT64_MAX-1 so that it is found for best_down_child*/ + priv->halo_child_up[idx] = 1; + if (priv->child_latency[idx] < 0) { + priv->child_latency[idx] = AFR_HALO_MAX_LATENCY; + } + } + + /* + * Handle the edge case where we exceed + * halo_min_replicas and we've got a child which is + * marked up as it was helping to satisfy the + * halo_min_replicas even though it's latency exceeds + * halo_max_latency_msec. + */ + if (up_children > priv->halo_min_replicas) { + worst_up_child = find_worst_up_child(this); + if (worst_up_child >= 0 && + priv->child_latency[worst_up_child] > halo_max_latency_msec) { + gf_msg_debug(this->name, 0, + "Marking child %d down, " + "doesn't meet halo threshold (%" PRId64 + "), and > " + "halo_min_replicas (%d)", + worst_up_child, halo_max_latency_msec, + priv->halo_min_replicas); + priv->child_up[worst_up_child] = 0; + up_children--; + } + } + + if (up_children > priv->halo_max_replicas && !priv->shd.iamshd) { + worst_up_child = find_worst_up_child(this); + if (worst_up_child < 0) { + worst_up_child = idx; + } + priv->child_up[worst_up_child] = 0; + up_children--; + gf_msg_debug(this->name, 0, + "Marking child %d down, " + "up_children (%d) > halo_max_replicas (%d)", + worst_up_child, up_children, priv->halo_max_replicas); + } +out: + if (up_children == 1) { + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOL_UP, + "Subvolume '%s' came back up; " + "going online.", + child_xlator->name); + gf_event(EVENT_AFR_SUBVOL_UP, "client-pid=%d; subvol=%s", + this->ctx->cmd_args.client_pid, this->name); + } else { + *event = GF_EVENT_SOME_DESCENDENT_UP; + } - return 0; + priv->last_event[idx] = *event; +} + +void +__afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx, + int64_t child_latency_msec, int32_t *event, + int32_t *call_psh, int32_t *up_child) +{ + afr_private_t *priv = NULL; + int i = 0; + int up_children = 0; + int down_children = 0; + int best_down_child = -1; + + priv = this->private; + + /* + * If a brick is down when we start, we'll get a + * CHILD_DOWN to indicate its initial state. There + * was never a CHILD_UP in this case, so if we + * increment "down_count" the difference between than + * and "up_count" will no longer be the number of + * children that are currently up. This has serious + * implications e.g. for quorum enforcement, so we + * don't increment these values unless the event + * represents an actual state transition between "up" + * (value = 1) and anything else. + */ + if (priv->child_up[idx] == 1) { + priv->event_generation++; + } + + /* + * If this is an _actual_ CHILD_DOWN event, we + * want to set the child_latency to < 0 to indicate + * the child is really disconnected. + */ + if (child_latency_msec < 0) { + priv->child_latency[idx] = child_latency_msec; + priv->halo_child_up[idx] = 0; + } + priv->child_up[idx] = 0; + + up_children = __afr_get_up_children_count(priv); + /* + * Handle the edge case where we need to find the + * next best child (to mark up) as marking this child + * down would cause us to fall below halo_min_replicas. + * We will also force the SHD to heal this child _now_ + * as we want it to be up to date if we are going to + * begin using it synchronously. + */ + if (priv->halo_enabled && up_children < priv->halo_min_replicas) { + best_down_child = find_best_down_child(this); + if (best_down_child >= 0) { + gf_msg_debug(this->name, 0, + "Swapping out child %d for " + "child %d to satisfy halo_min_replicas (%d).", + idx, best_down_child, priv->halo_min_replicas); + priv->child_up[best_down_child] = 1; + *call_psh = 1; + *up_child = best_down_child; + } + } + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == 0) + down_children++; + if (down_children == priv->child_count) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SUBVOLS_DOWN, + "All subvolumes are down. Going " + "offline until at least one of them " + "comes back up."); + gf_event(EVENT_AFR_SUBVOLS_DOWN, "client-pid=%d; subvol=%s", + this->ctx->cmd_args.client_pid, this->name); + } else { + *event = GF_EVENT_SOME_DESCENDENT_DOWN; + } + priv->last_event[idx] = *event; } +void +afr_ta_lock_release_synctask(xlator_t *this) +{ + call_frame_t *ta_frame = NULL; + int ret = 0; -/** - * find_child_index - find the child's index in the array of subvolumes - * @this: AFR - * @child: child - */ + ta_frame = afr_ta_frame_create(this); + if (!ta_frame) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to create ta_frame"); + return; + } -static int -find_child_index (xlator_t *this, xlator_t *child) + ret = synctask_new(this->ctx->env, afr_release_notify_lock_for_ta, + afr_ta_lock_release_done, ta_frame, this); + if (ret) { + STACK_DESTROY(ta_frame->root); + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to release " + "AFR_TA_DOM_NOTIFY lock."); + } +} + +static void +afr_handle_inodelk_contention(xlator_t *this, struct gf_upcall *upcall) { - afr_private_t *priv = NULL; - int i = -1; + struct gf_upcall_inodelk_contention *lc = NULL; + unsigned int inmem_count = 0; + unsigned int onwire_count = 0; + afr_private_t *priv = this->private; - priv = this->private; + lc = upcall->data; - for (i = 0; i < priv->child_count; i++) { - if ((xlator_t *) child == priv->children[i]) - break; - } + if (strcmp(lc->domain, AFR_TA_DOM_NOTIFY) != 0) + return; - return i; + if (priv->shd.iamshd) { + /* shd should ignore AFR_TA_DOM_NOTIFY release requests. */ + return; + } + LOCK(&priv->lock); + { + if (priv->release_ta_notify_dom_lock == _gf_true) { + /* Ignore multiple release requests from shds.*/ + UNLOCK(&priv->lock); + return; + } + priv->release_ta_notify_dom_lock = _gf_true; + inmem_count = priv->ta_in_mem_txn_count; + onwire_count = priv->ta_on_wire_txn_count; + } + UNLOCK(&priv->lock); + if (inmem_count || onwire_count) + /* lock release will happen in txn code path after + * in-memory or on-wire txns are over.*/ + return; + + afr_ta_lock_release_synctask(this); +} + +static void +afr_handle_upcall_event(xlator_t *this, struct gf_upcall *upcall) +{ + struct gf_upcall_cache_invalidation *up_ci = NULL; + afr_private_t *priv = this->private; + inode_t *inode = NULL; + inode_table_t *itable = NULL; + int i = 0; + + switch (upcall->event_type) { + case GF_UPCALL_INODELK_CONTENTION: + afr_handle_inodelk_contention(this, upcall); + break; + case GF_UPCALL_CACHE_INVALIDATION: + up_ci = (struct gf_upcall_cache_invalidation *)upcall->data; + + /* Since md-cache will be aggressively filtering + * lookups, the stale read issue will be more + * pronounced. Hence when a pending xattr is set notify + * all the md-cache clients to invalidate the existing + * stat cache and send the lookup next time */ + if (!up_ci->dict) + break; + for (i = 0; i < priv->child_count; i++) { + if (!dict_get(up_ci->dict, priv->pending_key[i])) + continue; + up_ci->flags |= UP_INVAL_ATTR; + itable = ((xlator_t *)this->graph->top)->itable; + /*Internal processes may not have itable for + *top xlator*/ + if (itable) + inode = inode_find(itable, upcall->gfid); + if (inode) + afr_inode_need_refresh_set(inode, this); + break; + } + break; + default: + break; + } } int32_t -afr_notify (xlator_t *this, int32_t event, - void *data, void *data2) -{ - afr_private_t *priv = NULL; - int i = -1; - int up_children = 0; - int down_children = 0; - int propagate = 0; - int had_heard_from_all = 0; - int have_heard_from_all = 0; - int idx = -1; - int ret = -1; - int call_psh = 0; - int up_child = AFR_ALL_CHILDREN; - dict_t *input = NULL; - dict_t *output = NULL; - - priv = this->private; - - if (!priv) - return 0; - - /* - * We need to reset this in case children come up in "staggered" - * fashion, so that we discover a late-arriving local subvolume. Note - * that we could end up issuing N lookups to the first subvolume, and - * O(N^2) overall, but N is small for AFR so it shouldn't be an issue. - */ - priv->did_discovery = _gf_false; +afr_notify(xlator_t *this, int32_t event, void *data, void *data2) +{ + afr_private_t *priv = NULL; + xlator_t *child_xlator = NULL; + int i = -1; + int propagate = 0; + int had_heard_from_all = 0; + int have_heard_from_all = 0; + int idx = -1; + int ret = -1; + int call_psh = 0; + int up_child = -1; + dict_t *input = NULL; + dict_t *output = NULL; + gf_boolean_t had_quorum = _gf_false; + gf_boolean_t has_quorum = _gf_false; + int64_t halo_max_latency_msec = 0; + int64_t child_latency_msec = -1; + + child_xlator = (xlator_t *)data; + + priv = this->private; + + if (!priv) + return 0; - had_heard_from_all = 1; - for (i = 0; i < priv->child_count; i++) { - if (!priv->last_event[i]) { - had_heard_from_all = 0; - } + /* + * We need to reset this in case children come up in "staggered" + * fashion, so that we discover a late-arriving local subvolume. Note + * that we could end up issuing N lookups to the first subvolume, and + * O(N^2) overall, but N is small for AFR so it shouldn't be an issue. + */ + priv->did_discovery = _gf_false; + + /* parent xlators don't need to know about every child_up, child_down + * because of afr ha. If all subvolumes go down, child_down has + * to be triggered. In that state when 1 subvolume comes up child_up + * needs to be triggered. dht optimizes revalidate lookup by sending + * it only to one of its subvolumes. When child up/down happens + * for afr's subvolumes dht should be notified by child_modified. The + * subsequent revalidate lookup happens on all the dht's subvolumes + * which triggers afr self-heals if any. + */ + idx = afr_find_child_index(this, child_xlator); + if (idx < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP, + "Received child_up from invalid subvolume"); + goto out; + } + + had_quorum = priv->quorum_count && + afr_has_quorum(priv->child_up, this, NULL); + if (event == GF_EVENT_CHILD_PING) { + child_latency_msec = (int64_t)(uintptr_t)data2; + if (priv->halo_enabled) { + halo_max_latency_msec = afr_get_halo_latency(this); + + /* Calculates the child latency and sets event + */ + LOCK(&priv->lock); + { + __afr_handle_ping_event(this, child_xlator, idx, + halo_max_latency_msec, &event, + child_latency_msec); + } + UNLOCK(&priv->lock); + } else { + LOCK(&priv->lock); + { + priv->child_latency[idx] = child_latency_msec; + } + UNLOCK(&priv->lock); } + } - /* parent xlators dont need to know about every child_up, child_down - * because of afr ha. If all subvolumes go down, child_down has - * to be triggered. In that state when 1 subvolume comes up child_up - * needs to be triggered. dht optimizes revalidate lookup by sending - * it only to one of its subvolumes. When child up/down happens - * for afr's subvolumes dht should be notified by child_modified. The - * subsequent revalidate lookup happens on all the dht's subvolumes - * which triggers afr self-heals if any. + if (event == GF_EVENT_CHILD_PING) { + /* This is the only xlator that handles PING, no reason to + * propagate. */ - idx = find_child_index (this, data); - if (idx < 0) { - gf_log (this->name, GF_LOG_ERROR, "Received child_up " - "from invalid subvolume"); - goto out; + goto out; + } + + if (event == GF_EVENT_TRANSLATOR_OP) { + LOCK(&priv->lock); + { + had_heard_from_all = __get_heard_from_all_status(this); + } + UNLOCK(&priv->lock); + + if (!had_heard_from_all) { + ret = -1; + } else { + input = data; + output = data2; + ret = afr_xl_op(this, input, output); } + goto out; + } + if (event == GF_EVENT_UPCALL) { + afr_handle_upcall_event(this, data); + } + + LOCK(&priv->lock); + { + had_heard_from_all = __get_heard_from_all_status(this); switch (event) { - case GF_EVENT_CHILD_UP: - LOCK (&priv->lock); - { - /* - * This only really counts if the child was never up - * (value = -1) or had been down (value = 0). See - * comment at GF_EVENT_CHILD_DOWN for a more detailed - * explanation. - */ - if (priv->child_up[idx] != 1) { - priv->up_count++; - } - priv->child_up[idx] = 1; - - call_psh = 1; - up_child = idx; - for (i = 0; i < priv->child_count; i++) - if (priv->child_up[i] == 1) - up_children++; - if (up_children == 1) { - gf_log (this->name, GF_LOG_INFO, - "Subvolume '%s' came back up; " - "going online.", ((xlator_t *)data)->name); - } else { - event = GF_EVENT_CHILD_MODIFIED; - } - - priv->last_event[idx] = event; + case GF_EVENT_PARENT_UP: + __afr_launch_notify_timer(this, priv); + propagate = 1; + break; + case GF_EVENT_CHILD_UP: + if (priv->thin_arbiter_count && + (idx == AFR_CHILD_THIN_ARBITER)) { + priv->ta_child_up = 1; + priv->ta_event_gen++; + break; } - UNLOCK (&priv->lock); - + __afr_handle_child_up_event(this, child_xlator, idx, + child_latency_msec, &event, + &call_psh, &up_child); + __afr_lock_heal_synctask(this, priv, idx); break; - case GF_EVENT_CHILD_DOWN: - LOCK (&priv->lock); - { - /* - * If a brick is down when we start, we'll get a - * CHILD_DOWN to indicate its initial state. There - * was never a CHILD_UP in this case, so if we - * increment "down_count" the difference between than - * and "up_count" will no longer be the number of - * children that are currently up. This has serious - * implications e.g. for quorum enforcement, so we - * don't increment these values unless the event - * represents an actual state transition between "up" - * (value = 1) and anything else. - */ - if (priv->child_up[idx] == 1) { - priv->down_count++; - } - priv->child_up[idx] = 0; - - for (i = 0; i < priv->child_count; i++) - if (priv->child_up[i] == 0) - down_children++; - if (down_children == priv->child_count) { - gf_log (this->name, GF_LOG_ERROR, - "All subvolumes are down. Going offline " - "until atleast one of them comes back up."); - } else { - event = GF_EVENT_CHILD_MODIFIED; - } - - priv->last_event[idx] = event; + case GF_EVENT_CHILD_DOWN: + if (priv->thin_arbiter_count && + (idx == AFR_CHILD_THIN_ARBITER)) { + priv->ta_child_up = 0; + priv->ta_event_gen++; + afr_ta_locked_priv_invalidate(priv); + break; } - UNLOCK (&priv->lock); - + __afr_handle_child_down_event(this, child_xlator, idx, + child_latency_msec, &event, + &call_psh, &up_child); + __afr_mark_pending_lk_heal(this, priv, idx); break; - case GF_EVENT_CHILD_CONNECTING: - LOCK (&priv->lock); - { - priv->last_event[idx] = event; - } - UNLOCK (&priv->lock); + case GF_EVENT_CHILD_CONNECTING: + priv->last_event[idx] = event; break; - case GF_EVENT_TRANSLATOR_OP: - input = data; - output = data2; - ret = afr_xl_op (this, input, output); - goto out; + case GF_EVENT_SOME_DESCENDENT_DOWN: + priv->last_event[idx] = event; break; - - default: + default: propagate = 1; break; } - - /* have all subvolumes reported status once by now? */ - have_heard_from_all = 1; - for (i = 0; i < priv->child_count; i++) { - if (!priv->last_event[i]) - have_heard_from_all = 0; - } - - /* if all subvols have reported status, no need to hide anything - or wait for anything else. Just propagate blindly */ - if (have_heard_from_all) - propagate = 1; - + have_heard_from_all = __get_heard_from_all_status(this); if (!had_heard_from_all && have_heard_from_all) { - /* This is the first event which completes aggregation - of events from all subvolumes. If at least one subvol - had come up, propagate CHILD_UP, but only this time - */ - event = GF_EVENT_CHILD_DOWN; - - LOCK (&priv->lock); - { - up_children = afr_up_children_count (priv->child_up, - priv->child_count); - for (i = 0; i < priv->child_count; i++) { - if (priv->last_event[i] == GF_EVENT_CHILD_UP) { - event = GF_EVENT_CHILD_UP; - break; - } - - if (priv->last_event[i] == - GF_EVENT_CHILD_CONNECTING) { - event = GF_EVENT_CHILD_CONNECTING; - /* continue to check other events for CHILD_UP */ - } - } + if (priv->timer) { + gf_timer_call_cancel(this->ctx, priv->timer); + priv->timer = NULL; + } + /* This is the first event which completes aggregation + of events from all subvolumes. If at least one subvol + had come up, propagate CHILD_UP, but only this time + */ + event = GF_EVENT_CHILD_DOWN; + for (i = 0; i < priv->child_count; i++) { + if (priv->last_event[i] == GF_EVENT_CHILD_UP) { + event = GF_EVENT_CHILD_UP; + break; } - UNLOCK (&priv->lock); - } - ret = 0; - if (propagate) - ret = default_notify (this, event, data); - if (call_psh && priv->shd.iamshd) - afr_proactive_self_heal ((void*) (long) up_child); + if (priv->last_event[i] == GF_EVENT_CHILD_CONNECTING) { + event = GF_EVENT_CHILD_CONNECTING; + /* continue to check other events for CHILD_UP */ + } + } + } + } + UNLOCK(&priv->lock); + + if (priv->quorum_count) { + has_quorum = afr_has_quorum(priv->child_up, this, NULL); + if (!had_quorum && has_quorum) { + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_QUORUM_MET, + "Client-quorum is met"); + gf_event(EVENT_AFR_QUORUM_MET, "client-pid=%d; subvol=%s", + this->ctx->cmd_args.client_pid, this->name); + } + if (had_quorum && !has_quorum) { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_FAIL, + "Client-quorum is not met"); + gf_event(EVENT_AFR_QUORUM_FAIL, "client-pid=%d; subvol=%s", + this->ctx->cmd_args.client_pid, this->name); + } + } + + /* if all subvols have reported status, no need to hide anything + or wait for anything else. Just propagate blindly */ + if (have_heard_from_all) + propagate = 1; + + ret = 0; + if (propagate) + ret = default_notify(this, event, data); + + if ((!had_heard_from_all) || call_psh) { + /* Launch self-heal on all local subvolumes if: + * a) We have_heard_from_all for the first time + * b) Already heard from everyone, but we now got a child-up + * event. + */ + if (have_heard_from_all) { + afr_selfheal_childup(this, priv); + } + } +out: + return ret; +} +int +afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno) +{ + int __ret = -1; + local->op_ret = -1; + local->op_errno = EUCLEAN; + + __ret = syncbarrier_init(&local->barrier); + if (__ret) { + if (op_errno) + *op_errno = __ret; + goto out; + } + + local->child_up = GF_MALLOC(priv->child_count * sizeof(*local->child_up), + gf_afr_mt_char); + if (!local->child_up) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + memcpy(local->child_up, priv->child_up, + sizeof(*local->child_up) * priv->child_count); + local->call_count = AFR_COUNT(local->child_up, priv->child_count); + if (local->call_count == 0) { + gf_msg(THIS->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOLS_DOWN, + "no subvolumes up"); + if (op_errno) + *op_errno = ENOTCONN; + goto out; + } + + local->event_generation = priv->event_generation; + + local->read_attempted = GF_CALLOC(priv->child_count, sizeof(char), + gf_afr_mt_char); + if (!local->read_attempted) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + local->readable = GF_CALLOC(priv->child_count, sizeof(char), + gf_afr_mt_char); + if (!local->readable) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + local->readable2 = GF_CALLOC(priv->child_count, sizeof(char), + gf_afr_mt_char); + if (!local->readable2) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + local->read_subvol = -1; + + local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies), + gf_afr_mt_reply_t); + if (!local->replies) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + local->need_full_crawl = _gf_false; + if (priv->thin_arbiter_count) { + local->ta_child_up = priv->ta_child_up; + local->ta_failed_subvol = AFR_CHILD_UNKNOWN; + local->read_txn_query_child = AFR_CHILD_UNKNOWN; + local->ta_event_gen = priv->ta_event_gen; + local->fop_state = TA_SUCCESS; + } + local->is_new_entry = _gf_false; + + INIT_LIST_HEAD(&local->healer); + return 0; out: - return ret; + return -1; } int -afr_first_up_child (unsigned char *child_up, size_t child_count) +afr_internal_lock_init(afr_internal_lock_t *lk, size_t child_count) { - int ret = -1; - int i = 0; + int ret = -ENOMEM; - GF_ASSERT (child_up); + lk->lower_locked_nodes = GF_CALLOC(sizeof(*lk->lower_locked_nodes), + child_count, gf_afr_mt_char); + if (NULL == lk->lower_locked_nodes) + goto out; - for (i = 0; i < child_count; i++) { - if (child_up[i]) { - ret = i; - break; - } - } + lk->lock_op_ret = -1; + lk->lock_op_errno = EUCLEAN; - return ret; + ret = 0; +out: + return ret; } -int -afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) +void +afr_matrix_cleanup(int32_t **matrix, unsigned int m) { - int ret = -1; + int i = 0; - local->op_ret = -1; - local->op_errno = EUCLEAN; - - local->child_up = GF_CALLOC (priv->child_count, - sizeof (*local->child_up), - gf_afr_mt_char); - if (!local->child_up) { - if (op_errno) - *op_errno = ENOMEM; - goto out; - } - - memcpy (local->child_up, priv->child_up, - sizeof (*local->child_up) * priv->child_count); - local->call_count = afr_up_children_count (local->child_up, - priv->child_count); - if (local->call_count == 0) { - gf_log (THIS->name, GF_LOG_INFO, "no subvolumes up"); - if (op_errno) - *op_errno = ENOTCONN; - goto out; - } + if (!matrix) + goto out; + for (i = 0; i < m; i++) { + GF_FREE(matrix[i]); + } - local->child_errno = GF_CALLOC (priv->child_count, - sizeof (*local->child_errno), - gf_afr_mt_int32_t); - if (!local->child_errno) { - if (op_errno) - *op_errno = ENOMEM; - goto out; - } + GF_FREE(matrix); +out: + return; +} - local->transaction.postop_piggybacked = GF_CALLOC (priv->child_count, - sizeof (int), - gf_afr_mt_int32_t); - if (!local->transaction.postop_piggybacked) { - if (op_errno) - *op_errno = ENOMEM; - goto out; - } +int32_t ** +afr_matrix_create(unsigned int m, unsigned int n) +{ + int32_t **matrix = NULL; + int i = 0; - local->append_write = _gf_false; + matrix = GF_CALLOC(sizeof(*matrix), m, gf_afr_mt_int32_t); + if (!matrix) + goto out; - ret = 0; + for (i = 0; i < m; i++) { + matrix[i] = GF_CALLOC(sizeof(*matrix[i]), n, gf_afr_mt_int32_t); + if (!matrix[i]) + goto out; + } + return matrix; out: - return ret; + afr_matrix_cleanup(matrix, m); + return NULL; } int -afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, - transaction_lk_type_t lk_type) +afr_transaction_local_init(afr_local_t *local, xlator_t *this) +{ + int ret = -ENOMEM; + afr_private_t *priv = NULL; + + priv = this->private; + INIT_LIST_HEAD(&local->transaction.wait_list); + INIT_LIST_HEAD(&local->transaction.owner_list); + INIT_LIST_HEAD(&local->ta_waitq); + INIT_LIST_HEAD(&local->ta_onwireq); + ret = afr_internal_lock_init(&local->internal_lock, priv->child_count); + if (ret < 0) + goto out; + + ret = -ENOMEM; + local->pre_op_compat = priv->pre_op_compat; + + local->transaction.pre_op = GF_CALLOC(sizeof(*local->transaction.pre_op), + priv->child_count, gf_afr_mt_char); + if (!local->transaction.pre_op) + goto out; + + local->transaction.changelog_xdata = GF_CALLOC( + sizeof(*local->transaction.changelog_xdata), priv->child_count, + gf_afr_mt_dict_t); + if (!local->transaction.changelog_xdata) + goto out; + + if (priv->arbiter_count == 1) { + local->transaction.pre_op_sources = GF_CALLOC( + sizeof(*local->transaction.pre_op_sources), priv->child_count, + gf_afr_mt_char); + if (!local->transaction.pre_op_sources) + goto out; + } + + local->transaction.failed_subvols = GF_CALLOC( + sizeof(*local->transaction.failed_subvols), priv->child_count, + gf_afr_mt_char); + if (!local->transaction.failed_subvols) + goto out; + + local->pending = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS); + if (!local->pending) + goto out; + + ret = 0; +out: + return ret; +} + +void +afr_set_low_priority(call_frame_t *frame) { - int ret = -ENOMEM; + frame->root->pid = LOW_PRIO_PROC_PID; +} - lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), - child_count, gf_afr_mt_char); - if (NULL == lk->locked_nodes) - goto out; +void +afr_priv_destroy(afr_private_t *priv) +{ + int i = 0; + int child_count = -1; - lk->lower_locked_nodes = GF_CALLOC (sizeof (*lk->lower_locked_nodes), - child_count, gf_afr_mt_char); - if (NULL == lk->lower_locked_nodes) - goto out; + if (!priv) + goto out; - lk->lock_op_ret = -1; - lk->lock_op_errno = EUCLEAN; - lk->transaction_lk_type = lk_type; + GF_FREE(priv->sh_domain); + GF_FREE(priv->last_event); - ret = 0; + child_count = priv->child_count; + if (priv->thin_arbiter_count) { + child_count++; + } + if (priv->pending_key) { + for (i = 0; i < child_count; i++) + GF_FREE(priv->pending_key[i]); + } + + GF_FREE(priv->pending_reads); + GF_FREE(priv->local); + GF_FREE(priv->pending_key); + GF_FREE(priv->children); + GF_FREE(priv->anon_inode); + GF_FREE(priv->child_up); + GF_FREE(priv->halo_child_up); + GF_FREE(priv->child_latency); + LOCK_DESTROY(&priv->lock); + + GF_FREE(priv); out: - return ret; + return; } -void -afr_matrix_cleanup (int32_t **matrix, unsigned int m) +int ** +afr_mark_pending_changelog(afr_private_t *priv, unsigned char *pending, + dict_t *xattr, ia_type_t iat) { - int i = 0; + int i = 0; + int **changelog = NULL; + int idx = -1; + int m_idx = 0; + int d_idx = 0; + int ret = 0; - if (!matrix) - goto out; - for (i = 0; i < m; i++) { - GF_FREE (matrix[i]); - } + m_idx = afr_index_for_transaction_type(AFR_METADATA_TRANSACTION); + d_idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION); + + idx = afr_index_from_ia_type(iat); - GF_FREE (matrix); + changelog = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS); + if (!changelog) + goto out; + + for (i = 0; i < priv->child_count; i++) { + if (!pending[i]) + continue; + + changelog[i][m_idx] = hton32(1); + if (idx != -1) + changelog[i][idx] = hton32(1); + /* If the newentry marking is on a newly created directory, + * then mark it with the full-heal indicator. + */ + if ((IA_ISDIR(iat)) && (priv->esh_granular)) + changelog[i][d_idx] = hton32(1); + } + ret = afr_set_pending_dict(priv, xattr, changelog); + if (ret < 0) { + afr_matrix_cleanup(changelog, priv->child_count); + return NULL; + } out: - return; + return changelog; } -int32_t** -afr_matrix_create (unsigned int m, unsigned int n) +static dict_t * +afr_set_heal_info(char *status) { - int32_t **matrix = NULL; - int i = 0; + dict_t *dict = NULL; + int ret = -1; - matrix = GF_CALLOC (sizeof (*matrix), m, gf_afr_mt_int32_t); - if (!matrix) - goto out; + dict = dict_new(); + if (!dict) { + ret = -ENOMEM; + goto out; + } + + ret = dict_set_dynstr_sizen(dict, "heal-info", status); + if (ret) + gf_msg("", GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED, + "Failed to set heal-info key to " + "%s", + status); +out: + /* Any error other than EINVAL, dict_set_dynstr frees status */ + if (ret == -ENOMEM || ret == -EINVAL) { + GF_FREE(status); + } + + if (ret && dict) { + dict_unref(dict); + dict = NULL; + } + return dict; +} + +static gf_boolean_t +afr_is_dirty_count_non_unary_for_txn(xlator_t *this, struct afr_reply *replies, + afr_transaction_type type) +{ + afr_private_t *priv = this->private; + int *dirty = alloca0(priv->child_count * sizeof(int)); + int i = 0; + + afr_selfheal_extract_xattr(this, replies, type, dirty, NULL); + for (i = 0; i < priv->child_count; i++) { + if (dirty[i] > 1) + return _gf_true; + } + + return _gf_false; +} + +static gf_boolean_t +afr_is_dirty_count_non_unary(xlator_t *this, struct afr_reply *replies, + ia_type_t ia_type) +{ + gf_boolean_t data_chk = _gf_false; + gf_boolean_t mdata_chk = _gf_false; + gf_boolean_t entry_chk = _gf_false; + + switch (ia_type) { + case IA_IFDIR: + mdata_chk = _gf_true; + entry_chk = _gf_true; + break; + case IA_IFREG: + mdata_chk = _gf_true; + data_chk = _gf_true; + break; + default: + /*IA_IFBLK, IA_IFCHR, IA_IFLNK, IA_IFIFO, IA_IFSOCK*/ + mdata_chk = _gf_true; + break; + } + + if (data_chk && afr_is_dirty_count_non_unary_for_txn( + this, replies, AFR_DATA_TRANSACTION)) { + return _gf_true; + } else if (mdata_chk && afr_is_dirty_count_non_unary_for_txn( + this, replies, AFR_METADATA_TRANSACTION)) { + return _gf_true; + } else if (entry_chk && afr_is_dirty_count_non_unary_for_txn( + this, replies, AFR_ENTRY_TRANSACTION)) { + return _gf_true; + } + + return _gf_false; +} - for (i = 0; i < m; i++) { - matrix[i] = GF_CALLOC (sizeof (*matrix[i]), n, - gf_afr_mt_int32_t); - if (!matrix[i]) - goto out; +static int +afr_update_heal_status(xlator_t *this, struct afr_reply *replies, + ia_type_t ia_type, gf_boolean_t *esh, gf_boolean_t *dsh, + gf_boolean_t *msh, unsigned char pending) +{ + int ret = -1; + GF_UNUSED int ret1 = 0; + int i = 0; + int io_domain_lk_count = 0; + int shd_domain_lk_count = 0; + afr_private_t *priv = NULL; + char *key1 = NULL; + char *key2 = NULL; + + priv = this->private; + key1 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 + + strlen(this->name)); + key2 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 + + strlen(priv->sh_domain)); + sprintf(key1, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, this->name); + sprintf(key2, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, priv->sh_domain); + + for (i = 0; i < priv->child_count; i++) { + if ((replies[i].valid != 1) || (replies[i].op_ret != 0)) + continue; + if (!io_domain_lk_count) { + ret1 = dict_get_int32(replies[i].xdata, key1, &io_domain_lk_count); + } + if (!shd_domain_lk_count) { + ret1 = dict_get_int32(replies[i].xdata, key2, &shd_domain_lk_count); + } + } + + if (!pending) { + if ((afr_is_dirty_count_non_unary(this, replies, ia_type)) || + (!io_domain_lk_count)) { + /* Needs heal. */ + ret = 0; + } else { + /* No heal needed. */ + *dsh = *esh = *msh = 0; } - return matrix; -out: - afr_matrix_cleanup (matrix, m); - return NULL; + } else { + if (shd_domain_lk_count) { + ret = -EAGAIN; /*For 'possibly-healing'. */ + } else { + ret = 0; /*needs heal. Just set a non -ve value so that it is + assumed as the source index.*/ + } + } + return ret; } +/*return EIO, EAGAIN or pending*/ int -afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count) -{ - int ret = -ENOMEM; +afr_lockless_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid, + inode_t **inode, gf_boolean_t *entry_selfheal, + gf_boolean_t *data_selfheal, + gf_boolean_t *metadata_selfheal, unsigned char *pending) +{ + int ret = -1; + int i = 0; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + gf_boolean_t dsh = _gf_false; + gf_boolean_t msh = _gf_false; + gf_boolean_t esh = _gf_false; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *valid_on = NULL; + uint64_t *witness = NULL; + + priv = this->private; + replies = alloca0(sizeof(*replies) * priv->child_count); + sources = alloca0(sizeof(*sources) * priv->child_count); + sinks = alloca0(sizeof(*sinks) * priv->child_count); + witness = alloca0(sizeof(*witness) * priv->child_count); + valid_on = alloca0(sizeof(*valid_on) * priv->child_count); + + ret = afr_selfheal_unlocked_inspect(frame, this, gfid, inode, &dsh, &msh, + &esh, replies); + if (ret) + goto out; + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid && replies[i].op_ret == 0) { + valid_on[i] = 1; + } + } + if (msh) { + ret = afr_selfheal_find_direction(frame, this, replies, + AFR_METADATA_TRANSACTION, valid_on, + sources, sinks, witness, pending); + if (*pending & PFLAG_SBRAIN) + ret = -EIO; + if (ret) + goto out; + } + if (dsh) { + ret = afr_selfheal_find_direction(frame, this, replies, + AFR_DATA_TRANSACTION, valid_on, + sources, sinks, witness, pending); + if (*pending & PFLAG_SBRAIN) + ret = -EIO; + if (ret) + goto out; + } + if (esh) { + ret = afr_selfheal_find_direction(frame, this, replies, + AFR_ENTRY_TRANSACTION, valid_on, + sources, sinks, witness, pending); + if (*pending & PFLAG_SBRAIN) + ret = -EIO; + if (ret) + goto out; + } - lk->domain = dom; - lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), - child_count, gf_afr_mt_char); - if (NULL == lk->locked_nodes) + ret = afr_update_heal_status(this, replies, (*inode)->ia_type, &esh, &dsh, + &msh, *pending); +out: + *data_selfheal = dsh; + *entry_selfheal = esh; + *metadata_selfheal = msh; + if (replies) + afr_replies_wipe(replies, priv->child_count); + return ret; +} + +int +afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + gf_boolean_t data_selfheal = _gf_false; + gf_boolean_t metadata_selfheal = _gf_false; + gf_boolean_t entry_selfheal = _gf_false; + unsigned char pending = 0; + dict_t *dict = NULL; + int ret = -1; + int op_errno = ENOMEM; + inode_t *inode = NULL; + char *substr = NULL; + char *status = NULL; + call_frame_t *heal_frame = NULL; + afr_local_t *heal_local = NULL; + + /*Use frame with lk-owner set*/ + heal_frame = afr_frame_create(frame->this, &op_errno); + if (!heal_frame) { + ret = -1; + goto out; + } + heal_local = heal_frame->local; + heal_frame->local = frame->local; + + ret = afr_lockless_inspect(heal_frame, this, loc->gfid, &inode, + &entry_selfheal, &data_selfheal, + &metadata_selfheal, &pending); + + if (ret == -ENOMEM) { + ret = -1; + goto out; + } + + if (pending & PFLAG_PENDING) { + gf_asprintf(&substr, "-pending"); + if (!substr) + goto out; + } + + if (ret == -EIO) { + ret = gf_asprintf(&status, "split-brain%s", substr ? substr : ""); + if (ret < 0) { + goto out; + } + dict = afr_set_heal_info(status); + if (!dict) { + ret = -1; + goto out; + } + } else if (ret == -EAGAIN) { + ret = gf_asprintf(&status, "possibly-healing%s", substr ? substr : ""); + if (ret < 0) { + goto out; + } + dict = afr_set_heal_info(status); + if (!dict) { + ret = -1; + goto out; + } + } else if (ret >= 0) { + /* value of ret = source index + * so ret >= 0 and at least one of the 3 booleans set to + * true means a source is identified; heal is required. + */ + if (!data_selfheal && !entry_selfheal && !metadata_selfheal) { + status = gf_strdup("no-heal"); + if (!status) { + ret = -1; goto out; - ret = 0; + } + dict = afr_set_heal_info(status); + if (!dict) { + ret = -1; + goto out; + } + } else { + ret = gf_asprintf(&status, "heal%s", substr ? substr : ""); + if (ret < 0) { + goto out; + } + dict = afr_set_heal_info(status); + if (!dict) { + ret = -1; + goto out; + } + } + } else if (ret < 0) { + /* Apart from above checked -ve ret values, there are + * other possible ret values like ENOTCONN + * (returned when number of valid replies received are + * less than 2) + * in which case heal is required when one of the + * selfheal booleans is set. + */ + if (data_selfheal || entry_selfheal || metadata_selfheal) { + ret = gf_asprintf(&status, "heal%s", substr ? substr : ""); + if (ret < 0) { + goto out; + } + dict = afr_set_heal_info(status); + if (!dict) { + ret = -1; + goto out; + } + } + } + + ret = 0; + op_errno = 0; + out: + if (heal_frame) { + heal_frame->local = heal_local; + AFR_STACK_DESTROY(heal_frame); + } + AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL); + if (dict) + dict_unref(dict); + if (inode) + inode_unref(inode); + GF_FREE(substr); + return ret; +} + +int +_afr_is_split_brain(call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, afr_transaction_type type, + gf_boolean_t *spb) +{ + afr_private_t *priv = NULL; + uint64_t *witness = NULL; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + int sources_count = 0; + int ret = 0; + + priv = this->private; + + sources = alloca0(priv->child_count); + sinks = alloca0(priv->child_count); + witness = alloca0(priv->child_count * sizeof(*witness)); + + ret = afr_selfheal_find_direction(frame, this, replies, type, + priv->child_up, sources, sinks, witness, + NULL); + if (ret) return ret; + + sources_count = AFR_COUNT(sources, priv->child_count); + if (!sources_count) + *spb = _gf_true; + + return ret; } int -afr_transaction_local_init (afr_local_t *local, xlator_t *this) +afr_is_split_brain(call_frame_t *frame, xlator_t *this, inode_t *inode, + uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb) { - int child_up_count = 0; - int ret = -ENOMEM; - afr_private_t *priv = NULL; + int ret = -1; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; - priv = this->private; - ret = afr_internal_lock_init (&local->internal_lock, priv->child_count, - AFR_TRANSACTION_LK); - if (ret < 0) - goto out; + priv = this->private; - if ((local->transaction.type == AFR_DATA_TRANSACTION) || - (local->transaction.type == AFR_METADATA_TRANSACTION)) { - ret = afr_inodelk_init (&local->internal_lock.inodelk[0], - this->name, priv->child_count); - if (ret < 0) - goto out; - } - - ret = -ENOMEM; - child_up_count = afr_up_children_count (local->child_up, - priv->child_count); - if (priv->optimistic_change_log && child_up_count == priv->child_count) - local->optimistic_change_log = 1; + replies = alloca0(sizeof(*replies) * priv->child_count); - local->first_up_child = afr_first_up_child (local->child_up, - priv->child_count); + ret = afr_selfheal_unlocked_discover(frame, inode, gfid, replies); + if (ret) + goto out; - local->transaction.eager_lock = - GF_CALLOC (sizeof (*local->transaction.eager_lock), - priv->child_count, - gf_afr_mt_int32_t); + if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) { + ret = -EAGAIN; + goto out; + } - if (!local->transaction.eager_lock) - goto out; + ret = _afr_is_split_brain(frame, this, replies, AFR_DATA_TRANSACTION, + d_spb); + if (ret) + goto out; - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) - goto out; + ret = _afr_is_split_brain(frame, this, replies, AFR_METADATA_TRANSACTION, + m_spb); +out: + if (replies) { + afr_replies_wipe(replies, priv->child_count); + replies = NULL; + } + return ret; +} - local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op), - priv->child_count, - gf_afr_mt_char); - if (!local->transaction.pre_op) - goto out; +int +afr_get_split_brain_status_cbk(int ret, call_frame_t *frame, void *opaque) +{ + GF_FREE(opaque); + return 0; +} - local->pending = afr_matrix_create (priv->child_count, - AFR_NUM_CHANGE_LOGS); - if (!local->pending) - goto out; +int +afr_get_split_brain_status(void *opaque) +{ + gf_boolean_t d_spb = _gf_false; + gf_boolean_t m_spb = _gf_false; + int ret = -1; + int op_errno = 0; + int i = 0; + char *choices = NULL; + char *status = NULL; + dict_t *dict = NULL; + inode_t *inode = NULL; + afr_private_t *priv = NULL; + xlator_t **children = NULL; + call_frame_t *frame = NULL; + xlator_t *this = NULL; + loc_t *loc = NULL; + afr_spb_status_t *data = NULL; + + data = opaque; + frame = data->frame; + this = frame->this; + loc = data->loc; + priv = this->private; + children = priv->children; + + inode = afr_inode_find(this, loc->gfid); + if (!inode) + goto out; + + dict = dict_new(); + if (!dict) { + op_errno = ENOMEM; + ret = -1; + goto out; + } + + /* Calculation for string length : + * (child_count X length of child-name) + SLEN(" Choices :") + * child-name consists of : + * a) 251 = max characters for volname according to GD_VOLUME_NAME_MAX + * b) strlen("-client-00,") assuming 16 replicas + */ + choices = alloca0(priv->child_count * (256 + SLEN("-client-00,")) + + SLEN(" Choices:")); + + ret = afr_is_split_brain(frame, this, inode, loc->gfid, &d_spb, &m_spb); + if (ret) { + op_errno = -ret; + if (ret == -EAGAIN) { + ret = dict_set_sizen_str_sizen(dict, GF_AFR_SBRAIN_STATUS, + SBRAIN_HEAL_NO_GO_MSG); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, + AFR_MSG_DICT_SET_FAILED, + "Failed to set GF_AFR_SBRAIN_STATUS in dict"); + } + } + ret = -1; + goto out; + } + + if (d_spb || m_spb) { + sprintf(choices, " Choices:"); + for (i = 0; i < priv->child_count; i++) { + strcat(choices, children[i]->name); + strcat(choices, ","); + } + choices[strlen(choices) - 1] = '\0'; - local->transaction.txn_changelog = afr_matrix_create (priv->child_count, - AFR_NUM_CHANGE_LOGS); - if (!local->transaction.txn_changelog) - goto out; + ret = gf_asprintf(&status, + "data-split-brain:%s " + "metadata-split-brain:%s%s", + (d_spb) ? "yes" : "no", (m_spb) ? "yes" : "no", + choices); - INIT_LIST_HEAD (&local->transaction.eager_locked); + if (-1 == ret) { + op_errno = ENOMEM; + goto out; + } + ret = dict_set_dynstr_sizen(dict, GF_AFR_SBRAIN_STATUS, status); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + } else { + ret = dict_set_sizen_str_sizen(dict, GF_AFR_SBRAIN_STATUS, + SFILE_NOT_UNDER_DATA); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + } - ret = 0; + ret = 0; out: - return ret; + AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL); + if (dict) + dict_unref(dict); + if (inode) + inode_unref(inode); + return ret; } -void -afr_reset_children (int32_t *fresh_children, int32_t child_count) -{ - unsigned int i = 0; - for (i = 0; i < child_count; i++) - fresh_children[i] = -1; +int32_t +afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + int ret = 0; + int op_errno = 0; + dict_t *dict = NULL; + afr_local_t *local = NULL; + afr_local_t *heal_local = NULL; + call_frame_t *heal_frame = NULL; + + local = frame->local; + dict = dict_new(); + if (!dict) { + op_errno = ENOMEM; + ret = -1; + goto out; + } + + heal_frame = afr_frame_create(this, &op_errno); + if (!heal_frame) { + ret = -1; + goto out; + } + heal_local = heal_frame->local; + heal_frame->local = frame->local; + /*Initiate heal with heal_frame with lk-owner set so that inodelk/entrylk + * work correctly*/ + ret = afr_selfheal_do(heal_frame, this, loc->gfid); + + if (ret == 1 || ret == 2) { + ret = dict_set_sizen_str_sizen(dict, "sh-fail-msg", + SFILE_NOT_IN_SPLIT_BRAIN); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED, + "Failed to set sh-fail-msg in dict"); + ret = 0; + goto out; + } else { + if (local->xdata_rsp) { + /* 'sh-fail-msg' has been set in the dict during self-heal.*/ + dict_copy(local->xdata_rsp, dict); + ret = 0; + } else if (ret < 0) { + op_errno = -ret; + ret = -1; + } + } + +out: + if (heal_frame) { + heal_frame->local = heal_local; + AFR_STACK_DESTROY(heal_frame); + } + if (local->op == GF_FOP_GETXATTR) + AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL); + else if (local->op == GF_FOP_SETXATTR) + AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL); + if (dict) + dict_unref(dict); + return ret; } -int32_t* -afr_children_create (int32_t child_count) +int +afr_get_child_index_from_name(xlator_t *this, char *name) { - int32_t *children = NULL; - int i = 0; - - GF_ASSERT (child_count > 0); + afr_private_t *priv = this->private; + int index = -1; - children = GF_CALLOC (child_count, sizeof (*children), - gf_afr_mt_int32_t); - if (NULL == children) - goto out; - for (i = 0; i < child_count; i++) - children[i] = -1; + for (index = 0; index < priv->child_count; index++) { + if (!strcmp(priv->children[index]->name, name)) + goto out; + } + index = -1; out: - return children; + return index; } void -afr_children_add_child (int32_t *children, int32_t child, - int32_t child_count) -{ - gf_boolean_t child_found = _gf_false; - int i = 0; - - for (i = 0; i < child_count; i++) { - if (children[i] == -1) - break; - if (children[i] == child) { - child_found = _gf_true; - break; - } - } - - if (!child_found) { - GF_ASSERT (i < child_count); - children[i] = child; - } +afr_priv_need_heal_set(afr_private_t *priv, gf_boolean_t need_heal) +{ + LOCK(&priv->lock); + { + priv->need_heal = need_heal; + } + UNLOCK(&priv->lock); } void -afr_children_rm_child (int32_t *children, int32_t child, int32_t child_count) -{ - int i = 0; - - GF_ASSERT ((child >= 0) && (child < child_count)); - for (i = 0; i < child_count; i++) { - if (children[i] == -1) - break; - if (children[i] == child) { - if (i != (child_count - 1)) - memmove (children + i, children + i + 1, - sizeof (*children)*(child_count - i - 1)); - children[child_count - 1] = -1; - break; - } +afr_set_need_heal(xlator_t *this, afr_local_t *local) +{ + int i = 0; + afr_private_t *priv = this->private; + gf_boolean_t need_heal = _gf_false; + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && local->replies[i].need_heal) { + need_heal = _gf_true; + break; } + } + afr_priv_need_heal_set(priv, need_heal); + return; } -int -afr_get_children_count (int32_t *children, unsigned int child_count) +gf_boolean_t +afr_get_need_heal(xlator_t *this) { - int count = 0; - int i = 0; + afr_private_t *priv = this->private; + gf_boolean_t need_heal = _gf_true; - for (i = 0; i < child_count; i++) { - if (children[i] == -1) - break; - count++; - } - return count; + LOCK(&priv->lock); + { + need_heal = priv->need_heal; + } + UNLOCK(&priv->lock); + return need_heal; } -void -afr_set_low_priority (call_frame_t *frame) +int +afr_get_msg_id(char *op_type) { - frame->root->pid = LOW_PRIO_PROC_PID; + if (!strcmp(op_type, GF_AFR_REPLACE_BRICK)) + return AFR_MSG_REPLACE_BRICK_STATUS; + else if (!strcmp(op_type, GF_AFR_ADD_BRICK)) + return AFR_MSG_ADD_BRICK_STATUS; + return -1; } int -afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child, - int flags) +afr_fav_child_reset_sink_xattrs_cbk(int ret, call_frame_t *heal_frame, + void *opaque) { - int ret = 0; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; + call_frame_t *txn_frame = NULL; + afr_local_t *local = NULL; + afr_local_t *heal_local = NULL; + xlator_t *this = NULL; - GF_ASSERT (fd && fd->inode); - ret = afr_fd_ctx_set (this, fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not set fd ctx for fd=%p", fd); - goto out; + heal_local = heal_frame->local; + txn_frame = heal_local->heal_frame; + local = txn_frame->local; + this = txn_frame->this; + + /* Refresh the inode agan and proceed with the transaction.*/ + afr_inode_refresh(txn_frame, this, local->inode, NULL, local->refreshfn); + + AFR_STACK_DESTROY(heal_frame); + + return 0; +} + +int +afr_fav_child_reset_sink_xattrs(void *opaque) +{ + call_frame_t *heal_frame = NULL; + call_frame_t *txn_frame = NULL; + xlator_t *this = NULL; + gf_boolean_t d_spb = _gf_false; + gf_boolean_t m_spb = _gf_false; + afr_local_t *heal_local = NULL; + afr_local_t *txn_local = NULL; + afr_private_t *priv = NULL; + inode_t *inode = NULL; + unsigned char *locked_on = NULL; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *healed_sinks = NULL; + unsigned char *undid_pending = NULL; + struct afr_reply *locked_replies = NULL; + int ret = 0; + + heal_frame = (call_frame_t *)opaque; + heal_local = heal_frame->local; + txn_frame = heal_local->heal_frame; + txn_local = txn_frame->local; + this = txn_frame->this; + inode = txn_local->inode; + priv = this->private; + locked_on = alloca0(priv->child_count); + sources = alloca0(priv->child_count); + sinks = alloca0(priv->child_count); + healed_sinks = alloca0(priv->child_count); + undid_pending = alloca0(priv->child_count); + locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count); + + ret = _afr_is_split_brain(txn_frame, this, txn_local->replies, + AFR_DATA_TRANSACTION, &d_spb); + + ret = _afr_is_split_brain(txn_frame, this, txn_local->replies, + AFR_METADATA_TRANSACTION, &m_spb); + + /* Take appropriate locks and reset sink xattrs. */ + if (d_spb) { + ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name, 0, 0, + locked_on); + { + if (ret < priv->child_count) + goto data_unlock; + ret = __afr_selfheal_data_prepare( + heal_frame, this, inode, locked_on, sources, sinks, + healed_sinks, undid_pending, locked_replies, NULL); + } + data_unlock: + afr_selfheal_uninodelk(heal_frame, this, inode, this->name, 0, 0, + locked_on); + } + + if (m_spb) { + memset(locked_on, 0, sizeof(*locked_on) * priv->child_count); + memset(undid_pending, 0, sizeof(*undid_pending) * priv->child_count); + ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name, + LLONG_MAX - 1, 0, locked_on); + { + if (ret < priv->child_count) + goto mdata_unlock; + ret = __afr_selfheal_metadata_prepare( + heal_frame, this, inode, locked_on, sources, sinks, + healed_sinks, undid_pending, locked_replies, NULL); } + mdata_unlock: + afr_selfheal_uninodelk(heal_frame, this, inode, this->name, + LLONG_MAX - 1, 0, locked_on); + } - ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not get fd ctx for fd=%p", fd); + return ret; +} + +/* + * Concatenates the xattrs in local->replies separated by a delimiter. + */ +int +afr_serialize_xattrs_with_delimiter(call_frame_t *frame, xlator_t *this, + char *buf, const char *default_str, + int32_t *serz_len, char delimiter) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + char *xattr = NULL; + int i = 0; + int len = 0; + int keylen = 0; + size_t str_len = 0; + int ret = -1; + + priv = this->private; + local = frame->local; + + keylen = strlen(local->cont.getxattr.name); + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid || local->replies[i].op_ret) { + str_len = strlen(default_str); + buf = strncat(buf, default_str, str_len); + len += str_len; + buf[len++] = delimiter; + buf[len] = '\0'; + } else { + ret = dict_get_strn(local->replies[i].xattr, + local->cont.getxattr.name, keylen, &xattr); + if (ret) { + gf_msg("TEST", GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED, + "Failed to get the node_uuid of brick " + "%d", + i); goto out; - } + } + str_len = strlen(xattr); + buf = strncat(buf, xattr, str_len); + len += str_len; + buf[len++] = delimiter; + buf[len] = '\0'; + } + } + buf[--len] = '\0'; /*remove the last delimiter*/ + if (serz_len) + *serz_len = ++len; + ret = 0; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - fd_ctx->opened_on[child] = AFR_FD_OPENED; - if (!IA_ISDIR (fd->inode->ia_type)) { - fd_ctx->flags = flags; - } - ret = 0; out: - return ret; + return ret; } -gf_boolean_t -afr_have_quorum (char *logname, afr_private_t *priv) +uint64_t +afr_write_subvol_get(call_frame_t *frame, xlator_t *this) { - unsigned int quorum = 0; + afr_local_t *local = NULL; + uint64_t write_subvol = 0; - GF_VALIDATE_OR_GOTO(logname,priv,out); + local = frame->local; + LOCK(&local->inode->lock); + write_subvol = local->inode_ctx->write_subvol; + UNLOCK(&local->inode->lock); - quorum = priv->quorum_count; - if (quorum != AFR_QUORUM_AUTO) { - return (priv->up_count >= (priv->down_count + quorum)); - } + return write_subvol; +} - quorum = priv->child_count / 2 + 1; - if (priv->up_count >= (priv->down_count + quorum)) { - return _gf_true; - } +int +afr_write_subvol_set(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + unsigned char *data_accused = NULL; + unsigned char *metadata_accused = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint64_t val = 0; + int event = 0; + int i = 0; + + local = frame->local; + priv = this->private; + data_accused = alloca0(priv->child_count); + metadata_accused = alloca0(priv->child_count); + data_readable = alloca0(priv->child_count); + metadata_readable = alloca0(priv->child_count); + event = local->event_generation; + + afr_readables_fill(frame, this, local->inode, data_accused, + metadata_accused, data_readable, metadata_readable, + NULL); + + for (i = 0; i < priv->child_count; i++) { + if (data_readable[i]) + datamap |= (1 << i); + if (metadata_readable[i]) + metadatamap |= (1 << i); + } + + val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) | + (((uint64_t)event) << 32); + + LOCK(&local->inode->lock); + { + if (local->inode_ctx->write_subvol == 0 && + local->transaction.type == AFR_DATA_TRANSACTION) { + local->inode_ctx->write_subvol = val; + } + } + UNLOCK(&local->inode->lock); + + return 0; +} - /* - * Special case for even numbers of nodes: if we have exactly half - * and that includes the first ("senior-most") node, then that counts - * as quorum even if it wouldn't otherwise. This supports e.g. N=2 - * while preserving the critical property that there can only be one - * such group. - */ - if ((priv->child_count % 2) == 0) { - quorum = priv->child_count / 2; - if (priv->up_count >= (priv->down_count + quorum)) { - if (priv->child_up[0]) { - return _gf_true; - } - } - } +int +afr_write_subvol_reset(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; -out: - return _gf_false; + local = frame->local; + LOCK(&local->inode->lock); + { + GF_ASSERT(local->inode_ctx->lock_count > 0); + local->inode_ctx->lock_count--; + + if (!local->inode_ctx->lock_count) + local->inode_ctx->write_subvol = 0; + } + UNLOCK(&local->inode->lock); + + return 0; } -void -afr_priv_destroy (afr_private_t *priv) +int +afr_set_inode_local(xlator_t *this, afr_local_t *local, inode_t *inode) { - int i = 0; + int ret = 0; - if (!priv) - goto out; - inode_unref (priv->root_inode); - GF_FREE (priv->shd.pos); - GF_FREE (priv->shd.pending); - GF_FREE (priv->shd.inprogress); -// for (i = 0; i < priv->child_count; i++) -// if (priv->shd.timer && priv->shd.timer[i]) -// gf_timer_call_cancel (this->ctx, priv->shd.timer[i]); - GF_FREE (priv->shd.timer); - - if (priv->shd.healed) - eh_destroy (priv->shd.healed); - - if (priv->shd.heal_failed) - eh_destroy (priv->shd.heal_failed); - - if (priv->shd.split_brain) - eh_destroy (priv->shd.split_brain); - - GF_FREE (priv->last_event); - if (priv->pending_key) { - for (i = 0; i < priv->child_count; i++) - GF_FREE (priv->pending_key[i]); - } - GF_FREE (priv->pending_key); - GF_FREE (priv->children); - GF_FREE (priv->child_up); - LOCK_DESTROY (&priv->lock); - LOCK_DESTROY (&priv->read_child_lock); - pthread_mutex_destroy (&priv->mutex); - GF_FREE (priv); + local->inode = inode_ref(inode); + LOCK(&local->inode->lock); + { + ret = __afr_inode_ctx_get(this, local->inode, &local->inode_ctx); + } + UNLOCK(&local->inode->lock); + if (ret < 0) { + gf_msg_callingfn( + this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_INODE_CTX_GET_FAILED, + "Error getting inode ctx %s", uuid_utoa(local->inode->gfid)); + } + return ret; +} + +gf_boolean_t +afr_ta_is_fop_called_from_synctask(xlator_t *this) +{ + struct synctask *task = NULL; + gf_lkowner_t tmp_owner = { + 0, + }; + + task = synctask_get(); + if (!task) + return _gf_false; + + set_lk_owner_from_ptr(&tmp_owner, (void *)this); + + if (!is_same_lkowner(&tmp_owner, &task->frame->root->lk_owner)) + return _gf_false; + + return _gf_true; +} + +int +afr_ta_post_op_lock(xlator_t *this, loc_t *loc) +{ + int ret = 0; + uuid_t gfid = { + 0, + }; + afr_private_t *priv = this->private; + gf_boolean_t locked = _gf_false; + struct gf_flock flock1 = { + 0, + }; + struct gf_flock flock2 = { + 0, + }; + int32_t cmd = 0; + + /* Clients must take AFR_TA_DOM_NOTIFY lock only when the previous lock + * has been released in afr_notify due to upcall notification from shd. + */ + GF_ASSERT(priv->ta_notify_dom_lock_offset == 0); + + if (!priv->shd.iamshd) + GF_ASSERT(afr_ta_is_fop_called_from_synctask(this)); + flock1.l_type = F_WRLCK; + + while (!locked) { + if (priv->shd.iamshd) { + cmd = F_SETLKW; + flock1.l_start = 0; + flock1.l_len = 0; + } else { + cmd = F_SETLK; + gf_uuid_generate(gfid); + flock1.l_start = gfid_to_ino(gfid); + if (flock1.l_start < 0) + flock1.l_start = -flock1.l_start; + flock1.l_len = 1; + } + ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], + AFR_TA_DOM_NOTIFY, loc, cmd, &flock1, NULL, NULL); + if (!ret) { + locked = _gf_true; + priv->ta_notify_dom_lock_offset = flock1.l_start; + } else if (ret == -EAGAIN) { + continue; + } else { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to get " + "AFR_TA_DOM_NOTIFY lock on %s.", + loc->name); + goto out; + } + } + + flock2.l_type = F_WRLCK; + flock2.l_start = 0; + flock2.l_len = 0; + ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], + AFR_TA_DOM_MODIFY, loc, F_SETLKW, &flock2, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to get AFR_TA_DOM_MODIFY lock on %s.", loc->name); + flock1.l_type = F_UNLCK; + ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], + AFR_TA_DOM_NOTIFY, loc, F_SETLK, &flock1, NULL, + NULL); + } out: - return; + return ret; } int -xlator_subvolume_count (xlator_t *this) +afr_ta_post_op_unlock(xlator_t *this, loc_t *loc) +{ + afr_private_t *priv = this->private; + struct gf_flock flock = { + 0, + }; + int ret = 0; + + if (!priv->shd.iamshd) + GF_ASSERT(afr_ta_is_fop_called_from_synctask(this)); + flock.l_type = F_UNLCK; + flock.l_start = 0; + flock.l_len = 0; + + ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], + AFR_TA_DOM_MODIFY, loc, F_SETLK, &flock, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to unlock AFR_TA_DOM_MODIFY lock."); + goto out; + } + + if (!priv->shd.iamshd) + /* Mounts (clients) will not release the AFR_TA_DOM_NOTIFY lock + * in post-op as they use it as a notification mechanism. When + * shd sends a lock request on TA during heal, the clients will + * receive a lock-contention upcall notification upon which they + * will release the AFR_TA_DOM_NOTIFY lock after completing the + * in flight I/O.*/ + goto out; + + ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], + AFR_TA_DOM_NOTIFY, loc, F_SETLK, &flock, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to unlock AFR_TA_DOM_NOTIFY lock."); + } +out: + return ret; +} + +call_frame_t * +afr_ta_frame_create(xlator_t *this) { - int i = 0; - xlator_list_t *list = NULL; + call_frame_t *frame = NULL; + void *lk_owner = NULL; - for (list = this->children; list; list = list->next) - i++; - return i; + frame = create_frame(this, this->ctx->pool); + if (!frame) + return NULL; + lk_owner = (void *)this; + afr_set_lk_owner(frame, this, lk_owner); + return frame; } -inline gf_boolean_t -afr_is_errno_set (int *child_errno, int child) +gf_boolean_t +afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local) { - return child_errno[child]; + int data_count = 0; + + data_count = AFR_COUNT(local->child_up, priv->child_count); + if (data_count == 2) { + return _gf_true; + } else if (data_count == 1 && local->ta_child_up) { + return _gf_true; + } + + return _gf_false; } -inline gf_boolean_t -afr_is_errno_unset (int *child_errno, int child) +static gf_boolean_t +afr_is_add_replica_mount_lookup_on_root(call_frame_t *frame) { - return !afr_is_errno_set (child_errno, child); -} + afr_local_t *local = NULL; -void -afr_prepare_new_entry_pending_matrix (int32_t **pending, - gf_boolean_t (*is_pending) (int *, int), - int *ctx, struct iatt *buf, - unsigned int child_count) -{ - int midx = 0; - int idx = 0; - int i = 0; - - midx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); - if (IA_ISDIR (buf->ia_type)) - idx = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION); - else if (IA_ISREG (buf->ia_type)) - idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); - else - idx = -1; - for (i = 0; i < child_count; i++) { - if (is_pending (ctx, i)) { - pending[i][midx] = hton32 (1); - if (idx == -1) - continue; - pending[i][idx] = hton32 (1); - } - } + if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT) + return _gf_false; + + local = frame->local; + + if (local->op != GF_FOP_LOOKUP) + /* TODO:If the replica count is being increased on a plain distribute + * volume that was never mounted, we need to allow setxattr on '/' with + * GF_CLIENT_PID_NO_ROOT_SQUASH to accomodate for DHT layout setting */ + return _gf_false; + + if (local->inode == NULL) + return _gf_false; + + if (!__is_root_gfid(local->inode->gfid)) + return _gf_false; + + return _gf_true; } gf_boolean_t -afr_is_fd_fixable (fd_t *fd) +afr_lookup_has_quorum(call_frame_t *frame, const unsigned int up_children_count) { - if (!fd || !fd->inode) - return _gf_false; - else if (fd_is_anonymous (fd)) - return _gf_false; - else if (uuid_is_null (fd->inode->gfid)) - return _gf_false; - + if (frame && (up_children_count > 0) && + afr_is_add_replica_mount_lookup_on_root(frame)) return _gf_true; + + return _gf_false; } void -afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this) +afr_handle_replies_quorum(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - inode_t *inode = NULL; - afr_inode_ctx_t *ctx = NULL; - - local = frame->local; + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + unsigned char *success_replies = NULL; - if (local->fd) - inode = local->fd->inode; - else - inode = local->loc.inode; + success_replies = alloca0(priv->child_count); + afr_fill_success_replies(local, priv, success_replies); - if (!inode) - return; + if (priv->quorum_count && !afr_has_quorum(success_replies, this, NULL)) { + local->op_errno = afr_final_errno(local, priv); + if (!local->op_errno) + local->op_errno = afr_quorum_errno(priv); + local->op_ret = -1; + } +} - LOCK (&inode->lock); - { - ctx = __afr_inode_ctx_get (inode, this); - ctx->open_fd_count = local->open_fd_count; +gf_boolean_t +afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv, int child) +{ + int *pending = NULL; + int ret = 0; + int i = 0; + + ret = dict_get_ptr(dict, priv->pending_key[child], (void *)&pending); + if (ret == 0) { + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { + /* Not doing a ntoh32(pending) as we just want to check + * if it is non-zero or not. */ + if (pending[i]) { + return _gf_true; + } } - UNLOCK (&inode->lock); + } + + return _gf_false; } diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index 689dd84e646..f8bf8340dab 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -8,538 +8,339 @@ cases as published by the Free Software Foundation. */ - #include <libgen.h> #include <unistd.h> -#include <fnmatch.h> #include <sys/time.h> #include <stdlib.h> #include <signal.h> #include <string.h> -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "checksum.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/dict.h> +#include <glusterfs/list.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/compat.h> #include "afr.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" - -int -afr_examine_dir_sh_unwind (call_frame_t *frame, xlator_t *this, int32_t op_ret, - int32_t op_errno, int32_t sh_failed) -{ - afr_local_t *local = NULL; - - local = frame->local; - - afr_set_opendir_done (this, local->fd->inode); - - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd, NULL); - - return 0; -} - - -gf_boolean_t -__checksums_differ (uint32_t *checksum, int child_count, - unsigned char *child_up) -{ - int ret = _gf_false; - int i = 0; - uint32_t cksum = 0; - gf_boolean_t activate_check = _gf_false; - - for (i = 0; i < child_count; i++) { - if (!child_up[i]) - continue; - if (_gf_false == activate_check) { - cksum = checksum[i]; - activate_check = _gf_true; - continue; - } - - if (cksum != checksum[i]) { - ret = _gf_true; - break; - } - - cksum = checksum[i]; - } - - return ret; -} - +#include "afr-transaction.h" int32_t -afr_examine_dir_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries, dict_t *xdata) +afr_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - gf_dirent_t * entry = NULL; - gf_dirent_t * tmp = NULL; - char *reason = NULL; - int child_index = 0; - uint32_t entry_cksum = 0; - int call_count = 0; - off_t last_offset = 0; - inode_t *inode = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - inode = local->fd->inode; - - child_index = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "%s: failed to do opendir on %s", - local->loc.path, priv->children[child_index]->name); - local->op_ret = -1; - local->op_ret = op_errno; - goto out; - } + afr_local_t *local = NULL; + int call_count = -1; + int32_t child_index = 0; + afr_fd_ctx_t *fd_ctx = NULL; - if (op_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "%s: no entries found in %s", - local->loc.path, priv->children[child_index]->name); - goto out; - } + local = frame->local; + fd_ctx = local->fd_ctx; + child_index = (long)cookie; - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - entry_cksum = gf_rsync_weak_checksum ((unsigned char *)entry->d_name, - strlen (entry->d_name)); - local->cont.opendir.checksum[child_index] ^= entry_cksum; - } + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; - list_for_each_entry (entry, &entries->list, list) { - last_offset = entry->d_off; + LOCK(&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } else { + local->op_ret = op_ret; + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref(xdata); } + call_count = --local->call_count; + } + UNLOCK(&frame->lock); - /* read more entries */ - - STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->readdir, - local->fd, 131072, last_offset, NULL); + if (call_count == 0) { + afr_handle_replies_quorum(frame, this); + AFR_STACK_UNWIND(opendir, frame, local->op_ret, local->op_errno, + local->fd, NULL); + } - return 0; - -out: - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (__checksums_differ (local->cont.opendir.checksum, - priv->child_count, - local->child_up)) { - - sh->do_entry_self_heal = _gf_true; - sh->forced_merge = _gf_true; - - reason = "checksums of directory differ"; - afr_launch_self_heal (frame, this, inode, _gf_false, - inode->ia_type, reason, NULL, - afr_examine_dir_sh_unwind); - } else { - afr_set_opendir_done (this, inode); - - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd, NULL); - } - } - - return 0; + return 0; } - int -afr_examine_dir (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - int i = 0; - int call_count = 0; - - local = frame->local; - priv = this->private; - - local->cont.opendir.checksum = GF_CALLOC (priv->child_count, - sizeof (*local->cont.opendir.checksum), - gf_afr_mt_int32_t); - - call_count = afr_up_children_count (local->child_up, priv->child_count); - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->readdir, - local->fd, 131072, 0, NULL); - - if (!--call_count) - break; - } - } - - return 0; -} - - -int32_t -afr_opendir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - fd_t *fd, dict_t *xdata) +afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int32_t up_children_count = 0; - int ret = -1; - int call_count = -1; - int32_t child_index = 0; - - priv = this->private; - local = frame->local; - child_index = (long) cookie; - - up_children_count = afr_up_children_count (local->child_up, - priv->child_count); - - LOCK (&frame->lock); - { - if (op_ret >= 0) { - local->op_ret = op_ret; - ret = afr_child_fd_ctx_set (this, fd, child_index, 0); - if (ret) { - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } - } - - local->op_errno = op_errno; - } -unlock: - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (local->op_ret != 0) - goto out; - - if (!afr_is_opendir_done (this, local->fd->inode) && - up_children_count > 1 && priv->entry_self_heal) { - - /* - * This is the first opendir on this inode. We need - * to check if the directory's entries are the same - * on all subvolumes. This is needed in addition - * to regular entry self-heal because the readdir - * call is sent only to the first subvolume, and - * thus files that exist only there will never be healed - * otherwise (assuming changelog shows no anomalies). - */ - - gf_log (this->name, GF_LOG_TRACE, - "reading contents of directory %s looking for mismatch", - local->loc.path); - - afr_examine_dir (frame, this); - - } else { - /* do the unwind */ - goto out; - } - } - - return 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int call_count = -1; + int32_t op_errno = ENOMEM; + afr_fd_ctx_t *fd_ctx = NULL; -out: - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd, NULL); - - return 0; -} + priv = this->private; + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; -int32_t -afr_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - int child_count = 0; - int i = 0; - int ret = -1; - int call_count = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + local->op = GF_FOP_OPENDIR; - priv = this->private; + if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) { + op_errno = afr_quorum_errno(priv); + goto out; + } - child_count = priv->child_count; + if (!afr_is_consistent_io_possible(local, priv, &op_errno)) + goto out; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + fd_ctx = afr_fd_ctx_get(fd, this); + if (!fd_ctx) + goto out; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + loc_copy(&local->loc, loc); - loc_copy (&local->loc, loc); + local->fd = fd_ref(fd); + local->fd_ctx = fd_ctx; - local->fd = fd_ref (fd); + call_count = local->call_count; - call_count = local->call_count; + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE(frame, afr_opendir_cbk, (void *)(long)i, + priv->children[i], + priv->children[i]->fops->opendir, loc, fd, NULL); - for (i = 0; i < child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_opendir_cbk, - (void*) (long) i, - priv->children[i], - priv->children[i]->fops->opendir, - loc, fd, NULL); - - if (!--call_count) - break; - } + if (!--call_count) + break; } + } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (opendir, frame, -1, op_errno, fd, NULL); - - return 0; + AFR_STACK_UNWIND(opendir, frame, -1, op_errno, fd, NULL); + return 0; } +static int +afr_validate_read_subvol(inode_t *inode, xlator_t *this, int par_read_subvol) +{ + int gen = 0; + int entry_read_subvol = 0; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + data_readable = alloca0(priv->child_count); + metadata_readable = alloca0(priv->child_count); + + afr_inode_read_subvol_get(inode, this, data_readable, metadata_readable, + &gen); + + if (gen != priv->event_generation || !data_readable[par_read_subvol] || + !metadata_readable[par_read_subvol]) + return -1; + + /* Once the control reaches the following statement, it means that the + * parent's read subvol is perfectly readable. So calling + * either afr_data_subvol_get() or afr_metadata_subvol_get() would + * yield the same result. Hence, choosing afr_data_subvol_get() below. + */ + + if (!priv->consistent_metadata) + return 0; -/** - * Common algorithm for directory read calls: - * - * - Try the fop on the first child that is up - * - if we have failed due to ENOTCONN: - * try the next child - * - * Applicable to: readdir - */ - - -struct entry_name { - char *name; - struct list_head list; -}; + /* For an inode fetched through readdirp which is yet to be linked, + * inode ctx would not be initialised (yet). So this function returns + * -1 above due to gen being 0, which is why it is OK to pass NULL for + * read_subvol_args here. + */ + entry_read_subvol = afr_data_subvol_get(inode, this, NULL, NULL, NULL, + NULL); + if (entry_read_subvol != par_read_subvol) + return -1; + + return 0; +} static void -afr_forget_entries (fd_t *fd) +afr_readdir_transform_entries(call_frame_t *frame, gf_dirent_t *subvol_entries, + int subvol, gf_dirent_t *entries, fd_t *fd) { - struct entry_name *entry = NULL; - struct entry_name *tmp = NULL; - int ret = 0; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - - ret = fd_ctx_get (fd, THIS, &ctx); - if (ret < 0) { - gf_log (THIS->name, GF_LOG_INFO, - "could not get fd ctx for fd=%p", fd); - return; + int ret = -1; + gf_dirent_t *entry = NULL; + gf_dirent_t *tmp = NULL; + xlator_t *this = NULL; + afr_private_t *priv = NULL; + gf_boolean_t need_heal = _gf_false; + gf_boolean_t validate_subvol = _gf_false; + + this = THIS; + priv = this->private; + + need_heal = afr_get_need_heal(this); + validate_subvol = need_heal | priv->consistent_metadata; + + list_for_each_entry_safe(entry, tmp, &subvol_entries->list, list) + { + if (afr_is_private_directory(priv, fd->inode->gfid, entry->d_name, + frame->root->pid)) { + continue; } - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + list_del_init(&entry->list); + list_add_tail(&entry->list, &entries->list); - list_for_each_entry_safe (entry, tmp, &fd_ctx->entries, list) { - GF_FREE (entry->name); - list_del (&entry->list); - GF_FREE (entry); - } -} + if (!validate_subvol) + continue; -static void -afr_readdir_filter_trash_dir (gf_dirent_t *entries, fd_t *fd) -{ - gf_dirent_t * entry = NULL; - gf_dirent_t * tmp = NULL; - - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - if (__is_root_gfid (fd->inode->gfid) && - !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { - list_del_init (&entry->list); - GF_FREE (entry); - } + if (entry->inode) { + ret = afr_validate_read_subvol(entry->inode, this, subvol); + if (ret == -1) { + inode_unref(entry->inode); + entry->inode = NULL; + continue; + } } + } } int32_t -afr_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries, dict_t *xdata) +afr_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *subvol_entries, + dict_t *xdata) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; + gf_dirent_t entries; - if (op_ret == -1) - goto out; + INIT_LIST_HEAD(&entries.list); - local = frame->local; - afr_readdir_filter_trash_dir (entries, local->fd); + local = frame->local; -out: - AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries, NULL); + if (op_ret < 0 && !local->cont.readdir.offset) { + /* failover only if this was first readdir, detected + by offset == 0 */ + local->op_ret = op_ret; + local->op_errno = op_errno; + + afr_read_txn_continue(frame, this, (long)cookie); return 0; -} + } + if (op_ret >= 0) + afr_readdir_transform_entries(frame, subvol_entries, (long)cookie, + &entries, local->fd); -int32_t -afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, - dict_t *xdata) -{ - afr_local_t *local = NULL; + AFR_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, xdata); - if (op_ret == -1) - goto out; + gf_dirent_free(&entries); - local = frame->local; - afr_readdir_filter_trash_dir (entries, local->fd); + return 0; +} -out: - AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, NULL); +int +afr_readdir_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + + priv = this->private; + local = frame->local; + fd_ctx = afr_fd_ctx_get(local->fd, this); + if (!fd_ctx) { + local->op_errno = EINVAL; + local->op_ret = -1; + } + + if (subvol == -1 || !fd_ctx) { + AFR_STACK_UNWIND(readdir, frame, local->op_ret, local->op_errno, 0, 0); return 0; + } + + fd_ctx->readdir_subvol = subvol; + + if (local->op == GF_FOP_READDIR) + STACK_WIND_COOKIE(frame, afr_readdir_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->readdir, local->fd, + local->cont.readdir.size, local->cont.readdir.offset, + local->xdata_req); + else + STACK_WIND_COOKIE(frame, afr_readdir_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->readdirp, local->fd, + local->cont.readdir.size, local->cont.readdir.offset, + local->xdata_req); + return 0; } -int32_t -afr_do_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, int whichop, dict_t *dict) +int +afr_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, int whichop, dict_t *dict) { - afr_private_t *priv = NULL; - xlator_t **children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - int ret = -1; - int32_t op_errno = 0; - uint64_t read_child = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - children = priv->children; - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } - - read_child = afr_inode_get_read_ctx (this, fd->inode, - local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.readdir.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - fd_ctx = afr_fd_ctx_get (fd, this); - if (!fd_ctx) { - op_errno = EBADF; - goto out; - } - - if ((offset == 0) || (fd_ctx->call_child == -1)) { - fd_ctx->call_child = call_child; - } else if ((priv->readdir_failover == _gf_false) && - (call_child != fd_ctx->call_child)) { - op_errno = EBADF; - goto out; - } - - local->fd = fd_ref (fd); - local->cont.readdir.size = size; - local->cont.readdir.dict = (dict)? dict_ref (dict) : NULL; - - if (whichop == GF_FOP_READDIR) - STACK_WIND_COOKIE (frame, afr_readdir_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readdir, fd, - size, offset, dict); - else - STACK_WIND_COOKIE (frame, afr_readdirp_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readdirp, fd, - size, offset, dict); - - return 0; + afr_local_t *local = NULL; + int32_t op_errno = 0; + int subvol = -1; + afr_fd_ctx_t *fd_ctx = NULL; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + fd_ctx = afr_fd_ctx_get(fd, this); + if (!fd_ctx) { + op_errno = EINVAL; + goto out; + } + + local->op = whichop; + local->fd = fd_ref(fd); + local->cont.readdir.size = size; + local->cont.readdir.offset = offset; + local->xdata_req = (dict) ? dict_ref(dict) : NULL; + + subvol = fd_ctx->readdir_subvol; + + if (offset == 0 || subvol == -1) { + /* First readdir has option of failing over and selecting + an appropriate read subvolume */ + afr_read_txn(frame, this, fd->inode, afr_readdir_wind, + AFR_DATA_TRANSACTION); + } else { + /* But continued readdirs MUST stick to the same subvolume + without an option to failover */ + afr_readdir_wind(frame, this, subvol); + } + + return 0; out: - AFR_STACK_UNWIND (readdir, frame, -1, op_errno, NULL, NULL); - return 0; + AFR_STACK_UNWIND(readdir, frame, -1, op_errno, NULL, NULL); + return 0; } - int32_t -afr_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, dict_t *xdata) +afr_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) { - afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR, xdata); - return 0; -} + afr_do_readdir(frame, this, fd, size, offset, GF_FOP_READDIR, xdata); + return 0; +} int32_t -afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, dict_t *dict) +afr_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *dict) { - afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIRP, dict); - return 0; -} + afr_do_readdir(frame, this, fd, size, offset, GF_FOP_READDIRP, dict); + return 0; +} int32_t -afr_releasedir (xlator_t *this, fd_t *fd) +afr_releasedir(xlator_t *this, fd_t *fd) { - afr_forget_entries (fd); - afr_cleanup_fd_ctx (this, fd); + afr_cleanup_fd_ctx(this, fd); - return 0; + return 0; } diff --git a/xlators/cluster/afr/src/afr-dir-read.h b/xlators/cluster/afr/src/afr-dir-read.h index 09456d15949..773e925ec6c 100644 --- a/xlators/cluster/afr/src/afr-dir-read.h +++ b/xlators/cluster/afr/src/afr-dir-read.h @@ -11,26 +11,23 @@ #ifndef __DIR_READ_H__ #define __DIR_READ_H__ - int32_t -afr_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd, dict_t *xdata); +afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata); int32_t -afr_releasedir (xlator_t *this, fd_t *fd); +afr_releasedir(xlator_t *this, fd_t *fd); int32_t -afr_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, dict_t *xdata); - +afr_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata); int32_t -afr_readdirp (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, dict_t *dict); +afr_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *dict); int32_t -afr_checksum (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, dict_t *xdata); - +afr_checksum(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + dict_t *xdata); #endif /* __DIR_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index 1943b719bb5..b7cceb79158 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -8,500 +8,493 @@ cases as published by the Free Software Foundation. */ - #include <libgen.h> #include <unistd.h> -#include <fnmatch.h> #include <sys/time.h> #include <stdlib.h> #include <signal.h> -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" +#include <glusterfs/glusterfs.h> #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" +#include <glusterfs/dict.h> +#include <glusterfs/logging.h> +#include <glusterfs/list.h> +#include <glusterfs/defaults.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/compat.h> +#include <glusterfs/byte-order.h> #include "afr.h" #include "afr-transaction.h" +void +afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this); + int -afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno) +afr_build_parent_loc(loc_t *parent, loc_t *child, int32_t *op_errno) { - int ret = -1; - char *child_path = NULL; - - if (!child->parent) { - if (op_errno) - *op_errno = EINVAL; - goto out; - } - - child_path = gf_strdup (child->path); - if (!child_path) { - if (op_errno) - *op_errno = ENOMEM; - goto out; - } - parent->path = gf_strdup( dirname (child_path) ); - if (!parent->path) { - if (op_errno) - *op_errno = ENOMEM; - goto out; - } - parent->inode = inode_ref (child->parent); - uuid_copy (parent->gfid, child->pargfid); - - ret = 0; + int ret = -1; + char *child_path = NULL; + + if (!child->parent) { + if (op_errno) + *op_errno = EINVAL; + goto out; + } + + child_path = gf_strdup(child->path); + if (!child_path) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + parent->path = gf_strdup(dirname(child_path)); + if (!parent->path) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + parent->inode = inode_ref(child->parent); + gf_uuid_copy(parent->gfid, child->pargfid); + + ret = 0; out: - GF_FREE(child_path); + GF_FREE(child_path); - return ret; + return ret; } -void -__dir_entry_fop_common_cbk (call_frame_t *frame, int child_index, - xlator_t *this, int32_t op_ret, - int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, struct iatt *prenewparent, - struct iatt *postnewparent) +static void +__afr_dir_write_finalize(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - - local = frame->local; - - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret > -1) { - local->op_ret = op_ret; - - if ((local->success_count == 0) || - (child_index == local->read_child_index)) { - local->cont.dir_fop.preparent = *preparent; - local->cont.dir_fop.postparent = *postparent; - if (buf) - local->cont.dir_fop.buf = *buf; - if (prenewparent) - local->cont.dir_fop.prenewparent = *prenewparent; - if (postnewparent) - local->cont.dir_fop.postnewparent = *postnewparent; - } - - local->cont.dir_fop.inode = inode; - - local->fresh_children[local->success_count] = child_index; - local->success_count++; - local->child_errno[child_index] = 0; - } else { - local->child_errno[child_index] = op_errno; - } - - local->op_errno = op_errno; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int inode_read_subvol = -1; + int parent_read_subvol = -1; + int parent2_read_subvol = -1; + int i = 0; + afr_read_subvol_args_t args = { + 0, + }; + + local = frame->local; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret == -1) + continue; + gf_uuid_copy(args.gfid, local->replies[i].poststat.ia_gfid); + args.ia_type = local->replies[i].poststat.ia_type; + break; + } + + if (local->inode) { + if (local->op != GF_FOP_RENAME && local->op != GF_FOP_LINK) + afr_replies_interpret(frame, this, local->inode, NULL); + + inode_read_subvol = afr_data_subvol_get(local->inode, this, NULL, NULL, + NULL, &args); + } + + if (local->parent) + parent_read_subvol = afr_data_subvol_get(local->parent, this, NULL, + local->readable, NULL, NULL); + + if (local->parent2) + parent2_read_subvol = afr_data_subvol_get(local->parent2, this, NULL, + local->readable2, NULL, NULL); + + local->op_ret = -1; + local->op_errno = afr_final_errno(local, priv); + afr_pick_error_xdata(local, priv, local->parent, local->readable, + local->parent2, local->readable2); + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret < 0) { + if (local->inode) + afr_inode_need_refresh_set(local->inode, this); + if (local->parent) + afr_inode_need_refresh_set(local->parent, this); + if (local->parent2) + afr_inode_need_refresh_set(local->parent2, this); + continue; + } + + if (local->op_ret == -1) { + local->op_ret = local->replies[i].op_ret; + local->op_errno = local->replies[i].op_errno; + + local->cont.dir_fop.buf = local->replies[i].poststat; + local->cont.dir_fop.preparent = local->replies[i].preparent; + local->cont.dir_fop.postparent = local->replies[i].postparent; + local->cont.dir_fop.prenewparent = local->replies[i].preparent2; + local->cont.dir_fop.postnewparent = local->replies[i].postparent2; + if (local->xdata_rsp) { + dict_unref(local->xdata_rsp); + local->xdata_rsp = NULL; + } + + if (local->replies[i].xdata) + local->xdata_rsp = dict_ref(local->replies[i].xdata); + continue; + } + + if (i == inode_read_subvol) { + local->cont.dir_fop.buf = local->replies[i].poststat; + if (local->replies[i].xdata) { + if (local->xdata_rsp) + dict_unref(local->xdata_rsp); + local->xdata_rsp = dict_ref(local->replies[i].xdata); + } + } + + if (i == parent_read_subvol) { + local->cont.dir_fop.preparent = local->replies[i].preparent; + local->cont.dir_fop.postparent = local->replies[i].postparent; + } + + if (i == parent2_read_subvol) { + local->cont.dir_fop.prenewparent = local->replies[i].preparent2; + local->cont.dir_fop.postnewparent = local->replies[i].postparent2; + } + } } -int -afr_mark_new_entry_changelog_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - dict_t *xattr, dict_t *xdata) +static void +__afr_dir_write_fill(call_frame_t *frame, xlator_t *this, int child_index, + int op_ret, int op_errno, struct iatt *poststat, + struct iatt *preparent, struct iatt *postparent, + struct iatt *preparent2, struct iatt *postparent2, + dict_t *xdata) { - int call_count = 0; - - call_count = afr_frame_return (frame); - if (call_count == 0) { - AFR_STACK_DESTROY (frame); - } - return 0; + afr_local_t *local = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + + local = frame->local; + fd_ctx = local->fd_ctx; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (xdata) + local->replies[child_index].xdata = dict_ref(xdata); + + if (op_ret >= 0) { + if (poststat) + local->replies[child_index].poststat = *poststat; + if (preparent) + local->replies[child_index].preparent = *preparent; + if (postparent) + local->replies[child_index].postparent = *postparent; + if (preparent2) + local->replies[child_index].preparent2 = *preparent2; + if (postparent2) + local->replies[child_index].postparent2 = *postparent2; + if (fd_ctx) + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + } else { + if (op_errno != ENOTEMPTY) + afr_transaction_fop_failed(frame, this, child_index); + if (fd_ctx) + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } + + return; } -void -afr_mark_new_entry_changelog (call_frame_t *frame, xlator_t *this) +static int +__afr_dir_write_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, + struct iatt *preparent2, struct iatt *postparent2, + dict_t *xdata) { - call_frame_t *new_frame = NULL; - afr_local_t *local = NULL; - afr_local_t *new_local = NULL; - afr_private_t *priv = NULL; - dict_t **xattr = NULL; - int32_t **changelog = NULL; - int i = 0; - GF_UNUSED int op_errno = 0; - - local = frame->local; - priv = this->private; - - new_frame = copy_frame (frame); - if (!new_frame) { - goto out; - } + afr_local_t *local = NULL; + int child_index = (long)cookie; + int call_count = -1; + afr_private_t *priv = NULL; - AFR_LOCAL_ALLOC_OR_GOTO (new_frame->local, out); - new_local = new_frame->local; - changelog = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS); - if (!changelog) - goto out; - - xattr = GF_CALLOC (priv->child_count, sizeof (*xattr), - gf_afr_mt_dict_t); - if (!xattr) - goto out; - for (i = 0; i < priv->child_count; i++) { - if (local->child_errno[i]) - continue; - xattr[i] = dict_new (); - if (!xattr[i]) - goto out; - } + priv = this->private; + local = frame->local; - afr_prepare_new_entry_pending_matrix (changelog, - afr_is_errno_set, - local->child_errno, - &local->cont.dir_fop.buf, - priv->child_count); - - new_local->pending = changelog; - uuid_copy (new_local->loc.gfid, local->cont.dir_fop.buf.ia_gfid); - new_local->loc.inode = inode_ref (local->cont.dir_fop.inode); - new_local->call_count = local->success_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_errno[i]) - continue; - - afr_set_pending_dict (priv, xattr[i], changelog, i, LOCAL_LAST); - STACK_WIND_COOKIE (new_frame, afr_mark_new_entry_changelog_cbk, - (void *) (long) i, priv->children[i], - priv->children[i]->fops->xattrop, - &new_local->loc, GF_XATTROP_ADD_ARRAY, - xattr[i], NULL); - } - new_frame = NULL; -out: - if (new_frame) - AFR_STACK_DESTROY (new_frame); - afr_xattr_array_destroy (xattr, priv->child_count); - return; -} + LOCK(&frame->lock); + { + __afr_dir_write_fill(frame, this, child_index, op_ret, op_errno, buf, + preparent, postparent, preparent2, postparent2, + xdata); + call_count = --local->call_count; + } + UNLOCK(&frame->lock); -gf_boolean_t -afr_is_new_entry_changelog_needed (glusterfs_fop_t fop) -{ - glusterfs_fop_t fops[] = {GF_FOP_CREATE, GF_FOP_MKNOD, GF_FOP_NULL}; - int i = 0; + if (call_count == 0) { + __afr_dir_write_finalize(frame, this); - for (i = 0; fops[i] != GF_FOP_NULL; i++) { - if (fop == fops[i]) - return _gf_true; + if (afr_txn_nothing_failed(frame, this)) { + /*if it did pre-op, it will do post-op changing ctime*/ + if (priv->consistent_metadata && afr_needs_changelog_update(local)) + afr_zero_fill_stat(local); + local->transaction.unwind(frame, this); } - return _gf_false; -} -void -afr_dir_fop_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_mark_entry_pending_changelog(frame, this); - local = frame->local; - priv = this->private; + afr_transaction_resume(frame, this); + } - if (local->op_ret < 0) - goto out; + return 0; +} - if (local->success_count == priv->child_count) - goto out; +int +afr_mark_new_entry_changelog_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + dict_t *xattr, dict_t *xdata) +{ + int call_count = 0; - if (!afr_is_new_entry_changelog_needed (local->op)) - goto out; + call_count = afr_frame_return(frame); - afr_mark_new_entry_changelog (frame, this); + if (call_count == 0) + AFR_STACK_DESTROY(frame); -out: - return; + return 0; } void -afr_dir_fop_done (call_frame_t *frame, xlator_t *this) +afr_mark_new_entry_changelog(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - priv = this->private; - - if (local->cont.dir_fop.inode == NULL) - goto done; - afr_set_read_ctx_from_policy (this, local->cont.dir_fop.inode, - local->fresh_children, - local->read_child_index, - priv->read_child, - local->cont.dir_fop.buf.ia_gfid); -done: - local->transaction.unwind (frame, this); - afr_dir_fop_mark_entry_pending_changelog (frame, this); - local->transaction.resume (frame, this); + call_frame_t *new_frame = NULL; + afr_local_t *local = NULL; + afr_local_t *new_local = NULL; + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + int32_t **changelog = NULL; + int i = 0; + int op_errno = ENOMEM; + unsigned char *pending = NULL; + int call_count = 0; + + local = frame->local; + priv = this->private; + + new_frame = copy_frame(frame); + if (!new_frame) + goto out; + + new_local = AFR_FRAME_INIT(new_frame, op_errno); + if (!new_local) + goto out; + + xattr = dict_new(); + if (!xattr) + goto out; + + pending = alloca0(priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] && + !local->transaction.failed_subvols[i]) { + call_count++; + continue; + } + pending[i] = 1; + } + + changelog = afr_mark_pending_changelog(priv, pending, xattr, + local->cont.dir_fop.buf.ia_type); + if (!changelog) + goto out; + + new_local->pending = changelog; + gf_uuid_copy(new_local->loc.gfid, local->cont.dir_fop.buf.ia_gfid); + new_local->loc.inode = inode_ref(local->inode); + + new_local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (pending[i]) + continue; + + STACK_WIND_COOKIE(new_frame, afr_mark_new_entry_changelog_cbk, + (void *)(long)i, priv->children[i], + priv->children[i]->fops->xattrop, &new_local->loc, + GF_XATTROP_ADD_ARRAY, xattr, NULL); + if (!--call_count) + break; + } + + new_frame = NULL; +out: + if (new_frame) + AFR_STACK_DESTROY(new_frame); + if (xattr) + dict_unref(xattr); + return; } -/* {{{ create */ - -int -afr_create_unwind (call_frame_t *frame, xlator_t *this) +void +afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int pre_op_count = 0; + int failed_count = 0; + unsigned char *success_replies = NULL; - local = frame->local; + local = frame->local; + priv = this->private; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (create, main_frame, - local->op_ret, local->op_errno, - local->cont.create.fd, - local->cont.dir_fop.inode, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - local->xdata_rsp); - } + if (local->op_ret < 0) + return; - return 0; -} + if (local->op != GF_FOP_CREATE && local->op != GF_FOP_MKNOD && + local->op != GF_FOP_MKDIR) + return; + pre_op_count = AFR_COUNT(local->transaction.pre_op, priv->child_count); + failed_count = AFR_COUNT(local->transaction.failed_subvols, + priv->child_count); -int -afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - fd_t *fd, inode_t *inode, struct iatt *buf, - struct iatt *preparent, struct iatt *postparent, - dict_t *xdata) -{ - afr_local_t *local = NULL; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - int ret = 0; - int call_count = -1; - int child_index = -1; - - local = frame->local; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret > -1) { - ret = afr_fd_ctx_set (this, fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not set ctx on fd=%p", fd); - - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } - - ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not get fd ctx for fd=%p", fd); - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - fd_ctx->opened_on[child_index] = AFR_FD_OPENED; - fd_ctx->flags = local->cont.create.flags; - - if (local->success_count == 0) { - if (xdata) - local->xdata_rsp = dict_ref(xdata); - } - } - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, inode, buf, - preparent, postparent, NULL, NULL); - } + /* FOP succeeded on all bricks. */ + if (pre_op_count == priv->child_count && !failed_count) + return; -unlock: - UNLOCK (&frame->lock); + /* FOP did not suceed on quorum no. of bricks. */ + success_replies = alloca0(priv->child_count); + afr_fill_success_replies(local, priv, success_replies); + if (!afr_has_quorum(success_replies, this, NULL)) + return; - call_count = afr_frame_return (frame); + if (priv->thin_arbiter_count) { + /*Mark new entry using ta file*/ + local->is_new_entry = _gf_true; + return; + } - if (call_count == 0) - afr_dir_fop_done (frame, this); + afr_mark_new_entry_changelog(frame, this); - return 0; + return; } +/* {{{ create */ int -afr_create_wind (call_frame_t *frame, xlator_t *this) +afr_create_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } + local = frame->local; - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_create_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->create, - &local->loc, - local->cont.create.flags, - local->cont.create.mode, - local->umask, - local->cont.create.fd, - local->xdata_req); - if (!--call_count) - break; - } - } + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(create, main_frame, local->op_ret, local->op_errno, + local->cont.create.fd, local->inode, + &local->cont.dir_fop.buf, &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; +} int -afr_create_done (call_frame_t *frame, xlator_t *this) +afr_create_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - afr_local_t * local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; + return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } - int -afr_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, - mode_t umask, fd_t *fd, dict_t *params) +afr_create_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - QUORUM_CHECK(create,out); - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; - goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - loc_copy (&local->loc, loc); - - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); - - local->op = GF_FOP_CREATE; - local->cont.create.flags = flags; - local->cont.create.mode = mode; - local->cont.create.fd = fd_ref (fd); - local->umask = umask; - if (params) - local->xdata_req = dict_ref (params); - - local->transaction.fop = afr_create_wind; - local->transaction.done = afr_create_done; - local->transaction.unwind = afr_create_unwind; - - ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, - &op_errno); - if (ret) - goto out; - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - int_lock = &local->internal_lock; - - int_lock->lockee_count = 0; - ret = afr_init_entry_lockee (&int_lock->lockee[0], local, - &local->transaction.parent_loc, - local->transaction.basename, - priv->child_count); - if (ret) - goto out; - - int_lock->lockee_count++; - ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE(frame, afr_create_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->create, &local->loc, + local->cont.create.flags, local->cont.create.mode, + local->umask, local->cont.create.fd, local->xdata_req); + return 0; +} - ret = 0; +int +afr_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + loc_copy(&local->loc, loc); + + local->fd_ctx = afr_fd_ctx_get(fd, this); + if (!local->fd_ctx) + goto out; + + local->inode = inode_ref(loc->inode); + local->parent = inode_ref(loc->parent); + + local->op = GF_FOP_CREATE; + local->cont.create.flags = flags; + local->fd_ctx->flags = flags; + local->cont.create.mode = mode; + local->cont.create.fd = fd_ref(fd); + local->umask = umask; + + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); + + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_create_wind; + local->transaction.unwind = afr_create_unwind; + + ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME(loc->path); + ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (create, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL, + NULL); + return 0; } /* }}} */ @@ -509,814 +502,436 @@ out: /* {{{ mknod */ int -afr_mknod_unwind (call_frame_t *frame, xlator_t *this) +afr_mknod_unwind(call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - local = frame->local; + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (mknod, main_frame, - local->op_ret, local->op_errno, - local->cont.dir_fop.inode, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - NULL); - } + local = frame->local; + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} - - -int -afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - int call_count = -1; - int child_index = -1; - - child_index = (long) cookie; - LOCK (&frame->lock); - { - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, inode, buf, - preparent, postparent, NULL, NULL); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + AFR_STACK_UNWIND(mknod, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; } - -int32_t -afr_mknod_wind (call_frame_t *frame, xlator_t *this) +int +afr_mknod_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mknod, - &local->loc, local->cont.mknod.mode, - local->cont.mknod.dev, - local->umask, - local->xdata_req); - if (!--call_count) - break; - } - } - - return 0; + return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } - int -afr_mknod_done (call_frame_t *frame, xlator_t *this) +afr_mknod_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t * local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - AFR_STACK_DESTROY (frame); - - return 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE(frame, afr_mknod_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->mknod, &local->loc, + local->cont.mknod.mode, local->cont.mknod.dev, + local->umask, local->xdata_req); + return 0; } - int -afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t dev, mode_t umask, dict_t *params) +afr_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t dev, mode_t umask, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - QUORUM_CHECK(mknod,out); - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; - goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - loc_copy (&local->loc, loc); - - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); - - local->op = GF_FOP_MKNOD; - local->cont.mknod.mode = mode; - local->cont.mknod.dev = dev; - local->umask = umask; - if (params) - local->xdata_req = dict_ref (params); - - local->transaction.fop = afr_mknod_wind; - local->transaction.done = afr_mknod_done; - local->transaction.unwind = afr_mknod_unwind; - - ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, - &op_errno); - if (ret) - goto out; - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - int_lock = &local->internal_lock; - - int_lock->lockee_count = 0; - ret = afr_init_entry_lockee (&int_lock->lockee[0], local, - &local->transaction.parent_loc, - local->transaction.basename, - priv->child_count); - if (ret) - goto out; - - int_lock->lockee_count++; - ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - ret = 0; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + loc_copy(&local->loc, loc); + local->inode = inode_ref(loc->inode); + local->parent = inode_ref(loc->parent); + + local->op = GF_FOP_MKNOD; + local->cont.mknod.mode = mode; + local->cont.mknod.dev = dev; + local->umask = umask; + + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); + + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_mknod_wind; + local->transaction.unwind = afr_mknod_unwind; + + ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME(loc->path); + ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (mknod, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + return 0; } /* }}} */ /* {{{ mkdir */ - int -afr_mkdir_unwind (call_frame_t *frame, xlator_t *this) +afr_mkdir_unwind(call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - local = frame->local; + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (mkdir, main_frame, - local->op_ret, local->op_errno, - local->cont.dir_fop.inode, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - NULL); - } + local = frame->local; + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} - - -int -afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - int call_count = -1; - int child_index = -1; - child_index = (long) cookie; - - LOCK (&frame->lock); - { - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, inode, buf, - preparent, postparent, NULL, NULL); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + AFR_STACK_UNWIND(mkdir, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; } - int -afr_mkdir_wind (call_frame_t *frame, xlator_t *this) +afr_mkdir_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mkdir, - &local->loc, local->cont.mkdir.mode, - local->umask, - local->xdata_req); - if (!--call_count) - break; - } - } - - return 0; + return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } - int -afr_mkdir_done (call_frame_t *frame, xlator_t *this) +afr_mkdir_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t * local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - AFR_STACK_DESTROY (frame); + local = frame->local; + priv = this->private; - return 0; + STACK_WIND_COOKIE(frame, afr_mkdir_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->mkdir, &local->loc, + local->cont.mkdir.mode, local->umask, local->xdata_req); + return 0; } int -afr_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, mode_t umask, dict_t *params) +afr_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - QUORUM_CHECK(mkdir,out); - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; - goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - loc_copy (&local->loc, loc); - - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); - - local->cont.mkdir.mode = mode; - local->umask = umask; - if (params) - local->xdata_req = dict_ref (params); - - local->op = GF_FOP_MKDIR; - local->transaction.fop = afr_mkdir_wind; - local->transaction.done = afr_mkdir_done; - local->transaction.unwind = afr_mkdir_unwind; - - ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, - &op_errno); - if (ret) - goto out; - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - int_lock = &local->internal_lock; - - int_lock->lockee_count = 0; - ret = afr_init_entry_lockee (&int_lock->lockee[0], local, - &local->transaction.parent_loc, - local->transaction.basename, - priv->child_count); - if (ret) - goto out; - - int_lock->lockee_count++; - ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - ret = 0; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + loc_copy(&local->loc, loc); + local->inode = inode_ref(loc->inode); + local->parent = inode_ref(loc->parent); + + local->cont.mkdir.mode = mode; + local->umask = umask; + + if (!xdata || !dict_get_sizen(xdata, "gfid-req")) { + op_errno = EPERM; + gf_msg_callingfn(this->name, GF_LOG_WARNING, op_errno, + AFR_MSG_GFID_NULL, + "mkdir: %s is received " + "without gfid-req %p", + loc->path, xdata); + goto out; + } + + local->xdata_req = dict_copy_with_ref(xdata, NULL); + if (!local->xdata_req) { + op_errno = ENOMEM; + goto out; + } + + local->op = GF_FOP_MKDIR; + local->transaction.wind = afr_mkdir_wind; + local->transaction.unwind = afr_mkdir_unwind; + + ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME(loc->path); + ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - - AFR_STACK_UNWIND (mkdir, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + return 0; } /* }}} */ /* {{{ link */ - int -afr_link_unwind (call_frame_t *frame, xlator_t *this) +afr_link_unwind(call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - local = frame->local; + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (link, main_frame, - local->op_ret, local->op_errno, - local->cont.dir_fop.inode, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - NULL); - } + local = frame->local; + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(link, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; +} int -afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +afr_link_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - int call_count = -1; - int child_index = -1; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, inode, buf, - preparent, postparent, NULL, NULL); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } - int -afr_link_wind (call_frame_t *frame, xlator_t *this) +afr_link_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_link_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->link, - &local->loc, - &local->newloc, local->xdata_req); - - if (!--call_count) - break; - } - } + local = frame->local; + priv = this->private; - return 0; + STACK_WIND_COOKIE(frame, afr_link_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->link, &local->loc, + &local->newloc, local->xdata_req); + return 0; } - int -afr_link_done (call_frame_t *frame, xlator_t *this) +afr_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - AFR_STACK_DESTROY (frame); - - return 0; -} + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; -int -afr_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; + loc_copy(&local->loc, oldloc); + loc_copy(&local->newloc, newloc); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + local->inode = inode_ref(oldloc->inode); + local->parent = inode_ref(newloc->parent); - priv = this->private; + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - QUORUM_CHECK(link,out); + if (!local->xdata_req) + goto out; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; - goto out; - } + local->op = GF_FOP_LINK; - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; + local->transaction.wind = afr_link_wind; + local->transaction.unwind = afr_link_unwind; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + ret = afr_build_parent_loc(&local->transaction.parent_loc, newloc, + &op_errno); + if (ret) + goto out; - loc_copy (&local->loc, oldloc); - loc_copy (&local->newloc, newloc); - if (xdata) - local->xdata_req = dict_ref (xdata); + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME(newloc->path); + ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); - - local->op = GF_FOP_LINK; - local->transaction.fop = afr_link_wind; - local->transaction.done = afr_link_done; - local->transaction.unwind = afr_link_unwind; - - ret = afr_build_parent_loc (&local->transaction.parent_loc, newloc, - &op_errno); - if (ret) - goto out; - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (newloc->path); - int_lock = &local->internal_lock; - - int_lock->lockee_count = 0; - ret = afr_init_entry_lockee (&int_lock->lockee[0], local, - &local->transaction.parent_loc, - local->transaction.basename, - priv->child_count); - if (ret) - goto out; - - int_lock->lockee_count++; - ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (link, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + return 0; } /* }}} */ /* {{{ symlink */ - int -afr_symlink_unwind (call_frame_t *frame, xlator_t *this) +afr_symlink_unwind(call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - local = frame->local; + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (symlink, main_frame, - local->op_ret, local->op_errno, - local->cont.dir_fop.inode, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - NULL); - } + local = frame->local; + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} - - -int -afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - int call_count = -1; - int child_index = -1; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, inode, buf, - preparent, postparent, NULL, NULL); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_dir_fop_done (frame, this); - return 0; + AFR_STACK_UNWIND(symlink, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; } - int -afr_symlink_wind (call_frame_t *frame, xlator_t *this) +afr_symlink_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->symlink, - local->cont.symlink.linkpath, - &local->loc, - local->umask, - local->xdata_req); - - if (!--call_count) - break; - - } - } - - return 0; + return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } - int -afr_symlink_done (call_frame_t *frame, xlator_t *this) +afr_symlink_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE(frame, afr_symlink_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->symlink, + local->cont.symlink.linkpath, &local->loc, local->umask, + local->xdata_req); + return 0; } - int -afr_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *loc, mode_t umask, dict_t *params) +afr_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, mode_t umask, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - QUORUM_CHECK(symlink,out); - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; - goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - loc_copy (&local->loc, loc); - - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); - - local->cont.symlink.linkpath = gf_strdup (linkpath); - local->umask = umask; - if (params) - local->xdata_req = dict_ref (params); - - local->op = GF_FOP_SYMLINK; - local->transaction.fop = afr_symlink_wind; - local->transaction.done = afr_symlink_done; - local->transaction.unwind = afr_symlink_unwind; - - ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, - &op_errno); - if (ret) - goto out; - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - int_lock = &local->internal_lock; - - int_lock->lockee_count = 0; - ret = afr_init_entry_lockee (&int_lock->lockee[0], local, - &local->transaction.parent_loc, - local->transaction.basename, - priv->child_count); - if (ret) - goto out; - - int_lock->lockee_count++; - ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - ret = 0; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + loc_copy(&local->loc, loc); + local->inode = inode_ref(loc->inode); + local->parent = inode_ref(loc->parent); + + local->cont.symlink.linkpath = gf_strdup(linkpath); + local->umask = umask; + + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); + + if (!local->xdata_req) + goto out; + + local->op = GF_FOP_SYMLINK; + local->transaction.wind = afr_symlink_wind; + local->transaction.unwind = afr_symlink_unwind; + + ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME(loc->path); + ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (symlink, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(symlink, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); + return 0; } /* }}} */ @@ -1324,233 +939,118 @@ out: /* {{{ rename */ int -afr_rename_unwind (call_frame_t *frame, xlator_t *this) +afr_rename_unwind(call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - local = frame->local; + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (rename, main_frame, - local->op_ret, local->op_errno, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - &local->cont.dir_fop.prenewparent, - &local->cont.dir_fop.postnewparent, - NULL); - } + local = frame->local; + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} - - -int -afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent, - dict_t *xdata) -{ - afr_local_t * local = NULL; - int call_count = -1; - int child_index = -1; - - local = frame->local; - - child_index = (long) cookie; - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno) && op_errno != ENOTEMPTY) - afr_transaction_fop_failed (frame, this, child_index); - local->op_errno = op_errno; - local->child_errno[child_index] = op_errno; - - if (op_ret > -1) - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, NULL, buf, - preoldparent, postoldparent, - prenewparent, postnewparent); - - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + AFR_STACK_UNWIND(rename, main_frame, local->op_ret, local->op_errno, + &local->cont.dir_fop.buf, &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + &local->cont.dir_fop.prenewparent, + &local->cont.dir_fop.postnewparent, local->xdata_rsp); + return 0; } - -int32_t -afr_rename_wind (call_frame_t *frame, xlator_t *this) +int +afr_rename_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->rename, - &local->loc, - &local->newloc, NULL); - if (!--call_count) - break; - } - } - - return 0; + return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf, + preoldparent, postoldparent, prenewparent, + postnewparent, xdata); } - int -afr_rename_done (call_frame_t *frame, xlator_t *this) +afr_rename_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - AFR_STACK_DESTROY (frame); + local = frame->local; + priv = this->private; - return 0; + STACK_WIND_COOKIE(frame, afr_rename_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->rename, &local->loc, + &local->newloc, local->xdata_req); + return 0; } - int -afr_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc, dict_t *xdata) +afr_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - int nlockee = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - QUORUM_CHECK(rename,out); - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; - goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - loc_copy (&local->loc, oldloc); - loc_copy (&local->newloc, newloc); - - local->read_child_index = afr_inode_get_read_ctx (this, oldloc->inode, NULL); - - local->op = GF_FOP_RENAME; - local->transaction.fop = afr_rename_wind; - local->transaction.done = afr_rename_done; - local->transaction.unwind = afr_rename_unwind; - - ret = afr_build_parent_loc (&local->transaction.parent_loc, oldloc, - &op_errno); - if (ret) - goto out; - ret = afr_build_parent_loc (&local->transaction.new_parent_loc, newloc, - &op_errno); - if (ret) - goto out; - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (oldloc->path); - local->transaction.new_basename = AFR_BASENAME (newloc->path); - int_lock = &local->internal_lock; - - int_lock->lockee_count = nlockee = 0; - ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, - &local->transaction.new_parent_loc, - local->transaction.new_basename, - priv->child_count); - if (ret) - goto out; - - nlockee++; - ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, - &local->transaction.parent_loc, - local->transaction.basename, - priv->child_count); - if (ret) - goto out; - - nlockee++; - if (local->newloc.inode && IA_ISDIR (local->newloc.inode->ia_type)) { - ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, - &local->newloc, - NULL, - priv->child_count); - if (ret) - goto out; - - nlockee++; - } - qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee), - afr_entry_lockee_cmp); - int_lock->lockee_count = nlockee; - - ret = afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - ret = 0; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + transaction_frame = copy_frame(frame); + if (!transaction_frame) { + op_errno = ENOMEM; + goto out; + } + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + loc_copy(&local->loc, oldloc); + loc_copy(&local->newloc, newloc); + + local->inode = inode_ref(oldloc->inode); + local->parent = inode_ref(oldloc->parent); + local->parent2 = inode_ref(newloc->parent); + + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); + + if (!local->xdata_req) + goto out; + + local->op = GF_FOP_RENAME; + local->transaction.wind = afr_rename_wind; + local->transaction.unwind = afr_rename_unwind; + + ret = afr_build_parent_loc(&local->transaction.parent_loc, oldloc, + &op_errno); + if (ret) + goto out; + ret = afr_build_parent_loc(&local->transaction.new_parent_loc, newloc, + &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME(oldloc->path); + local->transaction.new_basename = AFR_BASENAME(newloc->path); + ret = afr_transaction(transaction_frame, this, + AFR_ENTRY_RENAME_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - - AFR_STACK_UNWIND (rename, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL, + NULL); + return 0; } /* }}} */ @@ -1558,405 +1058,205 @@ out: /* {{{ unlink */ int -afr_unlink_unwind (call_frame_t *frame, xlator_t *this) +afr_unlink_unwind(call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - local = frame->local; + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (unlink, main_frame, - local->op_ret, local->op_errno, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - NULL); - } + local = frame->local; + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} - -int -afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - afr_local_t * local = NULL; - int call_count = -1; - int child_index = (long) cookie; - - local = frame->local; - - LOCK (&frame->lock); - { - if (child_index == local->read_child_index) { - local->read_child_returned = _gf_true; - } - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, NULL, NULL, - preparent, postparent, NULL, NULL); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + AFR_STACK_UNWIND(unlink, main_frame, local->op_ret, local->op_errno, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; } - -int32_t -afr_unlink_wind (call_frame_t *frame, xlator_t *this) +int +afr_unlink_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->unlink, - &local->loc, local->xflag, - local->xdata_req); - - if (!--call_count) - break; - } - } - - return 0; + return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, NULL, + preparent, postparent, NULL, NULL, xdata); } - -int32_t -afr_unlink_done (call_frame_t *frame, xlator_t *this) +int +afr_unlink_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - AFR_STACK_DESTROY (frame); + local = frame->local; + priv = this->private; - return 0; + STACK_WIND_COOKIE(frame, afr_unlink_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->unlink, &local->loc, + local->xflag, local->xdata_req); + return 0; } - -int32_t -afr_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, int xflag, dict_t *xdata) +int +afr_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - QUORUM_CHECK(unlink,out); - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; - goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - loc_copy (&local->loc, loc); - local->xflag = xflag; - if (xdata) - local->xdata_req = dict_ref (xdata); - - local->op = GF_FOP_UNLINK; - local->transaction.fop = afr_unlink_wind; - local->transaction.done = afr_unlink_done; - local->transaction.unwind = afr_unlink_unwind; - - ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, - &op_errno); - if (ret) - goto out; - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - int_lock = &local->internal_lock; - - int_lock->lockee_count = 0; - ret = afr_init_entry_lockee (&int_lock->lockee[0], local, - &local->transaction.parent_loc, - local->transaction.basename, - priv->child_count); - if (ret) - goto out; - - int_lock->lockee_count++; - ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - ret = 0; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + loc_copy(&local->loc, loc); + local->xflag = xflag; + + local->inode = inode_ref(loc->inode); + local->parent = inode_ref(loc->parent); + + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); + + if (!local->xdata_req) + goto out; + + local->op = GF_FOP_UNLINK; + local->transaction.wind = afr_unlink_wind; + local->transaction.unwind = afr_unlink_unwind; + + ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME(loc->path); + ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (unlink, frame, -1, op_errno, - NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(unlink, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } /* }}} */ /* {{{ rmdir */ - - int -afr_rmdir_unwind (call_frame_t *frame, xlator_t *this) +afr_rmdir_unwind(call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (rmdir, main_frame, - local->op_ret, local->op_errno, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - NULL); - } + local = frame->local; + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} - - -int -afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - afr_local_t * local = NULL; - int call_count = -1; - int child_index = (long) cookie; - int read_child = 0; - - local = frame->local; - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - if (afr_fop_failed (op_ret, op_errno) && (op_errno != ENOTEMPTY)) - afr_transaction_fop_failed (frame, this, child_index); - local->op_errno = op_errno; - local->child_errno[child_index] = op_errno; - if (op_ret > -1) - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, NULL, NULL, - preparent, postparent, NULL, - NULL); - - } - UNLOCK (&frame->lock); - call_count = afr_frame_return (frame); - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + AFR_STACK_UNWIND(rmdir, main_frame, local->op_ret, local->op_errno, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; } - int -afr_rmdir_wind (call_frame_t *frame, xlator_t *this) +afr_rmdir_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->rmdir, - &local->loc, local->cont.rmdir.flags, - NULL); - - if (!--call_count) - break; - } - } - - return 0; + return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, NULL, + preparent, postparent, NULL, NULL, xdata); } - int -afr_rmdir_done (call_frame_t *frame, xlator_t *this) +afr_rmdir_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t * local = frame->local; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local->transaction.unwind (frame, this); + local = frame->local; + priv = this->private; - AFR_STACK_DESTROY (frame); - - return 0; + STACK_WIND_COOKIE(frame, afr_rmdir_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->rmdir, &local->loc, + local->cont.rmdir.flags, local->xdata_req); + return 0; } - int -afr_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags, dict_t *xdata) +afr_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - int nlockee = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - QUORUM_CHECK(rmdir,out); - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; - goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - local->cont.rmdir.flags = flags; - loc_copy (&local->loc, loc); - - local->op = GF_FOP_RMDIR; - local->transaction.fop = afr_rmdir_wind; - local->transaction.done = afr_rmdir_done; - local->transaction.unwind = afr_rmdir_unwind; - - ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, - &op_errno); - if (ret) - goto out; - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - int_lock = &local->internal_lock; - - int_lock->lockee_count = nlockee = 0; - ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, - &local->transaction.parent_loc, - local->transaction.basename, - priv->child_count); - if (ret) - goto out; - - nlockee++; - ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, - &local->loc, - NULL, - priv->child_count); - if (ret) - goto out; - - nlockee++; - qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee), - afr_entry_lockee_cmp); - int_lock->lockee_count = nlockee; - - ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - ret = 0; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + loc_copy(&local->loc, loc); + local->inode = inode_ref(loc->inode); + local->parent = inode_ref(loc->parent); + + local->cont.rmdir.flags = flags; + + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); + + if (!local->xdata_req) + goto out; + + local->op = GF_FOP_RMDIR; + local->transaction.wind = afr_rmdir_wind; + local->transaction.unwind = afr_rmdir_unwind; + + ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME(loc->path); + ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(rmdir, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } /* }}} */ diff --git a/xlators/cluster/afr/src/afr-dir-write.h b/xlators/cluster/afr/src/afr-dir-write.h index 02f0a3682d9..1d88c3b9b26 100644 --- a/xlators/cluster/afr/src/afr-dir-write.h +++ b/xlators/cluster/afr/src/afr-dir-write.h @@ -12,36 +12,35 @@ #define __DIR_WRITE_H__ int32_t -afr_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, - mode_t umask, fd_t *fd, dict_t *xdata); +afr_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata); int32_t -afr_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t dev, mode_t umask, dict_t *xdata); +afr_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t dev, mode_t umask, dict_t *xdata); int32_t -afr_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata); +afr_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata); int32_t -afr_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, int xflag, dict_t *xdata); +afr_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata); int32_t -afr_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags, dict_t *xdata); +afr_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata); int32_t -afr_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc, dict_t *xdata); +afr_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata); int32_t -afr_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc, dict_t *xdata); +afr_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata); int -afr_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *oldloc, mode_t umask, dict_t *params); +afr_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *oldloc, mode_t umask, dict_t *params); #endif /* __DIR_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index 00dd22214cb..c5521704de2 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -8,7 +8,6 @@ cases as published by the Free Software Foundation. */ - #include <libgen.h> #include <unistd.h> #include <fnmatch.h> @@ -16,1925 +15,1880 @@ #include <stdlib.h> #include <signal.h> -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" +#include <glusterfs/glusterfs.h> #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" +#include <glusterfs/dict.h> +#include <glusterfs/logging.h> +#include <glusterfs/list.h> +#include <glusterfs/byte-order.h> +#include <glusterfs/defaults.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/compat.h> +#include <glusterfs/quota-common-utils.h> + +#include "afr-transaction.h" +#include "afr-messages.h" -/** - * Common algorithm for inode read calls: - * - * - Try the fop on the first child that is up - * - if we have failed due to ENOTCONN: - * try the next child - * - * Applicable to: access, stat, fstat, readlink, getxattr - */ +/* + * Quota size xattrs are not maintained by afr. There is a + * possibility that they differ even when both the directory changelog xattrs + * suggest everything is fine. So if there is at least one 'source' check among + * the sources which has the maximum quota size. Otherwise check among all the + * available ones for maximum quota size. This way if there is a source and + * stale copies it always votes for the 'source'. + * */ + +int +afr_handle_quota_size(call_frame_t *frame, xlator_t *this) +{ + unsigned char *readable = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int i = 0; + int ret = 0; + quota_meta_t size = { + 0, + }; + quota_meta_t max_size = { + 0, + }; + int readable_cnt = 0; + int read_subvol = -1; + + local = frame->local; + priv = this->private; + replies = local->replies; + + readable = alloca0(priv->child_count); + + afr_inode_read_subvol_get(local->inode, this, readable, 0, 0); + + readable_cnt = AFR_COUNT(readable, priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (readable_cnt && !readable[i]) + continue; + if (!replies[i].xdata) + continue; + ret = quota_dict_get_meta(replies[i].xdata, QUOTA_SIZE_KEY, + SLEN(QUOTA_SIZE_KEY), &size); + if (ret == -1) + continue; + if (read_subvol == -1) + read_subvol = i; + if (size.size > max_size.size || + (size.file_count + size.dir_count) > + (max_size.file_count + max_size.dir_count)) + read_subvol = i; + + if (size.size > max_size.size) + max_size.size = size.size; + if (size.file_count > max_size.file_count) + max_size.file_count = size.file_count; + if (size.dir_count > max_size.dir_count) + max_size.dir_count = size.dir_count; + } + + if (max_size.size == 0 && max_size.file_count == 0 && + max_size.dir_count == 0) + return read_subvol; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (readable_cnt && !readable[i]) + continue; + if (!replies[i].xdata) + continue; + quota_dict_set_meta(replies[i].xdata, QUOTA_SIZE_KEY, &max_size, + IA_IFDIR); + } + + return read_subvol; +} /* {{{ access */ -int32_t -afr_access_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) +int +afr_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - priv = this->private; - children = priv->children; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - read_child = (long) cookie; - - if (op_ret == -1) { - last_index = &local->cont.access.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; - - unwind = 0; - - STACK_WIND_COOKIE (frame, afr_access_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->access, - &local->loc, local->cont.access.mask, - NULL); - } - -out: - if (unwind) { - AFR_STACK_UNWIND (access, frame, op_ret, op_errno, xdata); - } + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + afr_read_txn_continue(frame, this, (long)cookie); return 0; -} - - -int32_t -afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, - dict_t *xdata) -{ - afr_private_t *priv = NULL; - xlator_t **children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - int32_t op_errno = 0; - int32_t read_child = -1; - int ret = -1; + } - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + AFR_STACK_UNWIND(access, frame, op_ret, op_errno, xdata); - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); + return 0; +} - children = priv->children; +int +afr_access_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + priv = this->private; + local = frame->local; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + if (subvol == -1) { + AFR_STACK_UNWIND(access, frame, local->op_ret, local->op_errno, 0); + return 0; + } - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } + STACK_WIND_COOKIE(frame, afr_access_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->access, &local->loc, + local->cont.access.mask, local->xdata_req); + return 0; +} +int +afr_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int mask, + dict_t *xdata) +{ + afr_local_t *local = NULL; + int op_errno = 0; - read_child = afr_inode_get_read_ctx (this, loc->inode, - local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.access.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - loc_copy (&local->loc, loc); - local->cont.access.mask = mask; + local->op = GF_FOP_ACCESS; + loc_copy(&local->loc, loc); + local->cont.access.mask = mask; + if (xdata) + local->xdata_req = dict_ref(xdata); - STACK_WIND_COOKIE (frame, afr_access_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->access, - loc, mask, xdata); + afr_read_txn(frame, this, loc->inode, afr_access_wind, + AFR_METADATA_TRANSACTION); - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (access, frame, -1, op_errno, NULL); - return 0; -} + AFR_STACK_UNWIND(access, frame, -1, op_errno, NULL); + return 0; +} /* }}} */ /* {{{ stat */ -int32_t -afr_stat_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *buf, dict_t *xdata) +int +afr_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; + afr_local_t *local = NULL; - priv = this->private; - children = priv->children; + local = frame->local; - read_child = (long) cookie; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - local = frame->local; - - if (op_ret == -1) { - last_index = &local->cont.stat.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; - - unwind = 0; - - STACK_WIND_COOKIE (frame, afr_stat_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->stat, - &local->loc, NULL); - } + afr_read_txn_continue(frame, this, (long)cookie); + return 0; + } -out: - if (unwind) { - AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata); - } + AFR_STACK_UNWIND(stat, frame, op_ret, op_errno, buf, xdata); - return 0; + return 0; } - -int32_t -afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +int +afr_stat_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - xlator_t **children = NULL; - int call_child = 0; - int32_t op_errno = 0; - int32_t read_child = -1; - int ret = -1; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + priv = this->private; + local = frame->local; - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; + if (subvol == -1) { + AFR_STACK_UNWIND(stat, frame, local->op_ret, local->op_errno, 0, 0); + return 0; + } - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + STACK_WIND_COOKIE( + frame, afr_stat_cbk, (void *)(long)subvol, priv->children[subvol], + priv->children[subvol]->fops->stat, &local->loc, local->xdata_req); + return 0; +} - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; +int +afr_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + afr_local_t *local = NULL; + int op_errno = 0; - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - read_child = afr_inode_get_read_ctx (this, loc->inode, - local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.stat.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } - loc_copy (&local->loc, loc); + local->op = GF_FOP_STAT; + loc_copy(&local->loc, loc); + if (xdata) + local->xdata_req = dict_ref(xdata); - STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child, - children[call_child], - children[call_child]->fops->stat, - loc, xdata); + afr_read_txn(frame, this, loc->inode, afr_stat_wind, AFR_DATA_TRANSACTION); - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND(stat, frame, -1, op_errno, NULL, NULL); - return 0; + return 0; } - /* }}} */ /* {{{ fstat */ -int32_t -afr_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - dict_t *xdata) +int +afr_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - xlator_t **children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - priv = this->private; - children = priv->children; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - read_child = (long) cookie; - - if (op_ret == -1) { - last_index = &local->cont.fstat.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; - - unwind = 0; - - STACK_WIND_COOKIE (frame, afr_fstat_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->fstat, - local->fd, NULL); - } - -out: - if (unwind) { - AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata); - } + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + afr_read_txn_continue(frame, this, (long)cookie); return 0; -} - - -int32_t -afr_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - xlator_t **children = NULL; - int call_child = 0; - int32_t op_errno = 0; - int32_t read_child = 0; - int ret = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); + } - children = priv->children; - - VALIDATE_OR_GOTO (fd->inode, out); - - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } + AFR_STACK_UNWIND(fstat, frame, op_ret, op_errno, buf, xdata); - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + return 0; +} - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; +int +afr_fstat_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } + priv = this->private; + local = frame->local; - read_child = afr_inode_get_read_ctx (this, fd->inode, - local->fresh_children); + if (subvol == -1) { + AFR_STACK_UNWIND(fstat, frame, local->op_ret, local->op_errno, 0, 0); + return 0; + } + STACK_WIND_COOKIE( + frame, afr_fstat_cbk, (void *)(long)subvol, priv->children[subvol], + priv->children[subvol]->fops->fstat, local->fd, local->xdata_req); + return 0; +} +int32_t +afr_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + afr_local_t *local = NULL; + int op_errno = 0; - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.fstat.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - local->fd = fd_ref (fd); + local->op = GF_FOP_FSTAT; + local->fd = fd_ref(fd); + if (xdata) + local->xdata_req = dict_ref(xdata); - afr_open_fd_fix (fd, this); + afr_fix_open(fd, this); - STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child, - children[call_child], - children[call_child]->fops->fstat, - fd, xdata); + afr_read_txn(frame, this, fd->inode, afr_fstat_wind, AFR_DATA_TRANSACTION); - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND(fstat, frame, -1, op_errno, NULL, NULL); - return 0; + return 0; } /* }}} */ /* {{{ readlink */ -int32_t -afr_readlink_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - const char *buf, struct iatt *sbuf, dict_t *xdata) +int +afr_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, const char *buf, + struct iatt *sbuf, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - priv = this->private; - children = priv->children; - - local = frame->local; + afr_local_t *local = NULL; - read_child = (long) cookie; - - if (op_ret == -1) { - last_index = &local->cont.readlink.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; - - unwind = 0; - STACK_WIND_COOKIE (frame, afr_readlink_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->readlink, - &local->loc, - local->cont.readlink.size, NULL); - } + local = frame->local; -out: - if (unwind) { - AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, buf, sbuf, - xdata); - } + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; + afr_read_txn_continue(frame, this, (long)cookie); return 0; -} + } + AFR_STACK_UNWIND(readlink, frame, op_ret, op_errno, buf, sbuf, xdata); + return 0; +} -int32_t -afr_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size, dict_t *xdata) +int +afr_readlink_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_private_t *priv = NULL; - xlator_t **children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - int32_t op_errno = 0; - int32_t read_child = -1; - int ret = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); + local = frame->local; + priv = this->private; - children = priv->children; - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + if (subvol == -1) { + AFR_STACK_UNWIND(readlink, frame, local->op_ret, local->op_errno, 0, 0, + 0); + return 0; + } - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + STACK_WIND_COOKIE(frame, afr_readlink_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->readlink, &local->loc, + local->cont.readlink.size, local->xdata_req); + return 0; +} - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } - read_child = afr_inode_get_read_ctx (this, loc->inode, - local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.readlink.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } +int +afr_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t op_errno = 0; - loc_copy (&local->loc, loc); + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - local->cont.readlink.size = size; + local->op = GF_FOP_READLINK; + loc_copy(&local->loc, loc); + local->cont.readlink.size = size; + if (xdata) + local->xdata_req = dict_ref(xdata); - STACK_WIND_COOKIE (frame, afr_readlink_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readlink, - loc, size, xdata); + afr_read_txn(frame, this, loc->inode, afr_readlink_wind, + AFR_DATA_TRANSACTION); - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (readlink, frame, -1, op_errno, NULL, NULL, NULL); - return 0; -} + AFR_STACK_UNWIND(readlink, frame, -1, op_errno, 0, 0, 0); + return 0; +} /* }}} */ /* {{{ getxattr */ struct _xattr_key { - char *key; - struct list_head list; + char *key; + struct list_head list; }; - int -__gather_xattr_keys (dict_t *dict, char *key, data_t *value, - void *data) +__gather_xattr_keys(dict_t *dict, char *key, data_t *value, void *data) { - struct list_head * list = data; - struct _xattr_key * xkey = NULL; + struct list_head *list = data; + struct _xattr_key *xkey = NULL; - if (!strncmp (key, AFR_XATTR_PREFIX, - strlen (AFR_XATTR_PREFIX))) { + if (!strncmp(key, AFR_XATTR_PREFIX, SLEN(AFR_XATTR_PREFIX))) { + xkey = GF_MALLOC(sizeof(*xkey), gf_afr_mt_xattr_key); + if (!xkey) + return -1; - xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key); - if (!xkey) - return -1; + xkey->key = key; + INIT_LIST_HEAD(&xkey->list); - xkey->key = key; - INIT_LIST_HEAD (&xkey->list); - - list_add_tail (&xkey->list, list); - } - return 0; + list_add_tail(&xkey->list, list); + } + return 0; } - void -__filter_xattrs (dict_t *dict) +afr_filter_xattrs(dict_t *dict) { - struct list_head keys = {0,}; - struct _xattr_key *key = NULL; - struct _xattr_key *tmp = NULL; + struct list_head keys = { + 0, + }; + struct _xattr_key *key = NULL; + struct _xattr_key *tmp = NULL; - INIT_LIST_HEAD (&keys); + INIT_LIST_HEAD(&keys); - dict_foreach (dict, __gather_xattr_keys, - (void *) &keys); + dict_foreach(dict, __gather_xattr_keys, (void *)&keys); - list_for_each_entry_safe (key, tmp, &keys, list) { - dict_del (dict, key->key); + list_for_each_entry_safe(key, tmp, &keys, list) + { + dict_del(dict, key->key); - list_del_init (&key->list); + list_del_init(&key->list); - GF_FREE (key); - } + GF_FREE(key); + } } +static gf_boolean_t +afr_getxattr_ignorable_errnos(int32_t op_errno) +{ + if (op_errno == ENODATA || op_errno == ENOTSUP || op_errno == ERANGE || + op_errno == ENAMETOOLONG) + return _gf_true; - -int32_t -afr_getxattr_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) + return _gf_false; +} +int +afr_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - priv = this->private; - children = priv->children; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - read_child = (long) cookie; - - if (op_ret == -1) { - last_index = &local->cont.getxattr.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; - - unwind = 0; - STACK_WIND_COOKIE (frame, afr_getxattr_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->getxattr, - &local->loc, - local->cont.getxattr.name, - NULL); - } + if (op_ret < 0 && !afr_getxattr_ignorable_errnos(op_errno)) { + local->op_ret = op_ret; + local->op_errno = op_errno; -out: - if (unwind) { - if (op_ret >= 0 && dict) - __filter_xattrs (dict); + afr_read_txn_continue(frame, this, (long)cookie); + return 0; + } - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); - } + if (dict) + afr_filter_xattrs(dict); + + AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata); + + return 0; +} +int +afr_getxattr_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (subvol == -1) { + AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno, NULL, + NULL); return 0; + } + + STACK_WIND_COOKIE(frame, afr_getxattr_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->getxattr, &local->loc, + local->cont.getxattr.name, local->xdata_req); + return 0; } int32_t -afr_getxattr_unwind (call_frame_t *frame, int op_ret, int op_errno, - dict_t *dict, dict_t *xdata) +afr_getxattr_unwind(call_frame_t *frame, int op_ret, int op_errno, dict_t *dict, + dict_t *xdata) { - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); - return 0; + AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata); + return 0; } int32_t -afr_fgetxattr_clrlk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) +afr_fgetxattr_clrlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - xlator_t **children = NULL; - dict_t *xattr = NULL; - char *tmp_report = NULL; - char lk_summary[1024] = {0,}; - int serz_len = 0; - int32_t callcnt = 0; - long int cky = 0; - int ret = 0; - - priv = this->private; - children = priv->children; - - local = frame->local; - cky = (long) cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret == -1) - local->child_errno[cky] = op_errno; - - if (!local->dict) - local->dict = dict_new (); - if (local->dict) { - ret = dict_get_str (dict, local->cont.getxattr.name, - &tmp_report); - if (ret) - goto unlock; - ret = dict_set_dynstr (local->dict, - children[cky]->name, - gf_strdup (tmp_report)); - if (ret) - goto unlock; - } - } + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + xlator_t **children = NULL; + dict_t *xattr = NULL; + char *tmp_report = NULL; + char lk_summary[1024] = { + 0, + }; + int serz_len = 0; + int32_t callcnt = 0; + long int cky = 0; + int ret = 0; + int keylen = 0; + int children_keylen = 0; + + priv = this->private; + children = priv->children; + + local = frame->local; + cky = (long)cookie; + keylen = strlen(local->cont.getxattr.name); + children_keylen = strlen(children[cky]->name); + + LOCK(&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == -1) + local->replies[cky].op_errno = op_errno; + + if (!local->dict) + local->dict = dict_new(); + if (local->dict) { + ret = dict_get_strn(dict, local->cont.getxattr.name, keylen, + &tmp_report); + if (ret) + goto unlock; + ret = dict_set_dynstrn(local->dict, children[cky]->name, + children_keylen, gf_strdup(tmp_report)); + if (ret) + goto unlock; + } + } unlock: - UNLOCK (&frame->lock); - - if (!callcnt) { - xattr = dict_new (); - if (!xattr) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } - ret = dict_serialize_value_with_delim (local->dict, - lk_summary, - &serz_len, '\n'); - if (ret) { - op_ret = -1; - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Error serializing dictionary"); - goto unwind; - } - if (serz_len == -1) - snprintf (lk_summary, sizeof (lk_summary), - "No locks cleared."); - ret = dict_set_dynstr (xattr, local->cont.getxattr.name, - gf_strdup (lk_summary)); - if (ret) { - op_ret = -1; - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Error setting dictionary"); - goto unwind; - } - - unwind: - // Updating child_errno with more recent 'events' - local->child_errno[cky] = op_errno; - op_errno = afr_resultant_errno_get (NULL, local->child_errno, - priv->child_count); - AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr, - xdata); - - if (xattr) - dict_unref (xattr); - } + UNLOCK(&frame->lock); + + if (!callcnt) { + xattr = dict_new(); + if (!xattr) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + ret = dict_serialize_value_with_delim(local->dict, lk_summary, + &serz_len, '\n'); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + if (serz_len == -1) + snprintf(lk_summary, sizeof(lk_summary), "No locks cleared."); + ret = dict_set_dynstrn(xattr, local->cont.getxattr.name, keylen, + gf_strdup(lk_summary)); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_DICT_SET_FAILED, + "Error setting dictionary"); + goto unwind; + } + + op_errno = afr_final_errno(local, priv); + + unwind: + AFR_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, xattr, xdata); + if (xattr) + dict_unref(xattr); + } + + return ret; +} - return ret; +int32_t +afr_getxattr_clrlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + xlator_t **children = NULL; + dict_t *xattr = NULL; + char *tmp_report = NULL; + char lk_summary[1024] = { + 0, + }; + int serz_len = 0; + int32_t callcnt = 0; + long int cky = 0; + int ret = 0; + int keylen = 0; + int children_keylen = 0; + + priv = this->private; + children = priv->children; + + local = frame->local; + cky = (long)cookie; + + keylen = strlen(local->cont.getxattr.name); + children_keylen = strlen(children[cky]->name); + + LOCK(&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == -1) + local->replies[cky].op_errno = op_errno; + + if (!local->dict) + local->dict = dict_new(); + if (local->dict) { + ret = dict_get_strn(dict, local->cont.getxattr.name, keylen, + &tmp_report); + if (ret) + goto unlock; + ret = dict_set_dynstrn(local->dict, children[cky]->name, + children_keylen, gf_strdup(tmp_report)); + if (ret) + goto unlock; + } + } +unlock: + UNLOCK(&frame->lock); + + if (!callcnt) { + xattr = dict_new(); + if (!xattr) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + ret = dict_serialize_value_with_delim(local->dict, lk_summary, + &serz_len, '\n'); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + if (serz_len == -1) + snprintf(lk_summary, sizeof(lk_summary), "No locks cleared."); + ret = dict_set_dynstrn(xattr, local->cont.getxattr.name, keylen, + gf_strdup(lk_summary)); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_DICT_SET_FAILED, + "Error setting dictionary"); + goto unwind; + } + + op_errno = afr_final_errno(local, priv); + + unwind: + AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, xattr, xdata); + + if (xattr) + dict_unref(xattr); + } + + return ret; } +/** + * node-uuid cbk uses next child querying mechanism + */ int32_t -afr_getxattr_clrlk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) +afr_getxattr_node_uuid_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - xlator_t **children = NULL; - dict_t *xattr = NULL; - char *tmp_report = NULL; - char lk_summary[1024] = {0,}; - int serz_len = 0; - int32_t callcnt = 0; - long int cky = 0; - int ret = 0; - - priv = this->private; - children = priv->children; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + xlator_t **children = NULL; + int unwind = 1; + int curr_call_child = 0; - local = frame->local; - cky = (long) cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret == -1) - local->child_errno[cky] = op_errno; - - if (!local->dict) - local->dict = dict_new (); - if (local->dict) { - ret = dict_get_str (dict, local->cont.getxattr.name, - &tmp_report); - if (ret) - goto unlock; - ret = dict_set_dynstr (local->dict, - children[cky]->name, - gf_strdup (tmp_report)); - if (ret) - goto unlock; - } - } -unlock: - UNLOCK (&frame->lock); - - if (!callcnt) { - xattr = dict_new (); - if (!xattr) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } - ret = dict_serialize_value_with_delim (local->dict, - lk_summary, - &serz_len, '\n'); - if (ret) { - op_ret = -1; - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Error serializing dictionary"); - goto unwind; - } - if (serz_len == -1) - snprintf (lk_summary, sizeof (lk_summary), - "No locks cleared."); - ret = dict_set_dynstr (xattr, local->cont.getxattr.name, - gf_strdup (lk_summary)); - if (ret) { - op_ret = -1; - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Error setting dictionary"); - goto unwind; - } + priv = this->private; + children = priv->children; - unwind: - // Updating child_errno with more recent 'events' - local->child_errno[cky] = op_errno; - op_errno = afr_resultant_errno_get (NULL, local->child_errno, - priv->child_count); - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata); + local = frame->local; - if (xattr) - dict_unref (xattr); - } + if (op_ret == -1) { /** query the _next_ child */ - return ret; + /** + * _current_ becomes _next_ + * If done with all children and yet no success; give up ! + */ + curr_call_child = (int)((long)cookie); + if (++curr_call_child == priv->child_count) + goto unwind; + + gf_msg_debug(this->name, op_errno, + "op_ret (-1): Re-querying afr-child (%d/%d)", + curr_call_child, priv->child_count); + + unwind = 0; + STACK_WIND_COOKIE( + frame, afr_getxattr_node_uuid_cbk, (void *)(long)curr_call_child, + children[curr_call_child], + children[curr_call_child]->fops->getxattr, &local->loc, + local->cont.getxattr.name, local->xdata_req); + } + +unwind: + if (unwind) + AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata); + + return 0; } /** - * node-uuid cbk uses next child querying mechanism + * list-node-uuids cbk returns the list of node_uuids for the subvolume. */ int32_t -afr_getxattr_node_uuid_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) +afr_getxattr_list_node_uuids_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *dict, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - xlator_t **children = NULL; - int unwind = 1; - int curr_call_child = 0; - - priv = this->private; - children = priv->children; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int32_t callcnt = 0; + int ret = 0; + char *xattr_serz = NULL; + long cky = 0; + int32_t tlen = 0; + + local = frame->local; + priv = this->private; + cky = (long)cookie; + + LOCK(&frame->lock); + { + callcnt = --local->call_count; + local->replies[cky].valid = 1; + local->replies[cky].op_ret = op_ret; + local->replies[cky].op_errno = op_errno; + + if (op_ret < 0) + goto unlock; + + local->op_ret = 0; + + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref(xdata); + local->replies[cky].xattr = dict_ref(dict); + } - local = frame->local; - - if (op_ret == -1) { /** query the _next_ child */ - - /** - * _current_ becomes _next_ - * If done with all childs and yet no success; give up ! - */ - curr_call_child = (int) ((long)cookie); - if (++curr_call_child == priv->child_count) - goto unwind; - - gf_log (this->name, GF_LOG_WARNING, - "op_ret (-1): Re-querying afr-child (%d/%d)", - curr_call_child, priv->child_count); - - unwind = 0; - STACK_WIND_COOKIE (frame, afr_getxattr_node_uuid_cbk, - (void *) (long) curr_call_child, - children[curr_call_child], - children[curr_call_child]->fops->getxattr, - &local->loc, - local->cont.getxattr.name, - NULL); +unlock: + UNLOCK(&frame->lock); + + if (!callcnt) { + if (local->op_ret != 0) { + /* All bricks gave an error. */ + local->op_errno = afr_final_errno(local, priv); + goto unwind; + } + + /*Since we store the UUID0_STR as node uuid for down bricks and + *for non zero op_ret, assigning length to priv->child_count + *number of uuids*/ + local->cont.getxattr.xattr_len = (SLEN(UUID0_STR) + 2) * + priv->child_count; + + if (!local->dict) + local->dict = dict_new(); + if (!local->dict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } + + xattr_serz = GF_CALLOC(local->cont.getxattr.xattr_len, sizeof(char), + gf_common_mt_char); + + if (!xattr_serz) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } + + ret = afr_serialize_xattrs_with_delimiter(frame, this, xattr_serz, + UUID0_STR, &tlen, ' '); + if (ret) { + local->op_ret = -1; + local->op_errno = ENOMEM; + GF_FREE(xattr_serz); + goto unwind; + } + ret = dict_set_dynstr_sizen(local->dict, GF_XATTR_LIST_NODE_UUIDS_KEY, + xattr_serz); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Cannot set node_uuid key in dict"); + local->op_ret = -1; + local->op_errno = ENOMEM; + if (ret == -EINVAL) + GF_FREE(xattr_serz); + } else { + local->op_ret = local->cont.getxattr.xattr_len - 1; + local->op_errno = 0; } - unwind: - if (unwind) - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, - NULL); + unwind: + AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno, + local->dict, local->xdata_rsp); + } - return 0; + return ret; } int32_t -afr_getxattr_lockinfo_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) +afr_getxattr_quota_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - int call_cnt = 0, len = 0; - char *lockinfo_buf = NULL; - dict_t *lockinfo = NULL, *newdict = NULL; - afr_local_t *local = NULL; + int idx = (long)cookie; + int call_count = 0; + afr_local_t *local = frame->local; + int read_subvol = -1; + + local->replies[idx].valid = 1; + local->replies[idx].op_ret = op_ret; + local->replies[idx].op_errno = op_errno; + if (dict) + local->replies[idx].xdata = dict_ref(dict); + call_count = afr_frame_return(frame); + if (call_count == 0) { + local->inode = inode_ref(local->loc.inode); + read_subvol = afr_handle_quota_size(frame, this); + if (read_subvol != -1) { + op_ret = local->replies[read_subvol].op_ret; + op_errno = local->replies[read_subvol].op_errno; + dict = local->replies[read_subvol].xdata; + } + AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata); + } + + return 0; +} - LOCK (&frame->lock); - { - local = frame->local; +int32_t +afr_getxattr_lockinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + int call_cnt = 0, len = 0; + char *lockinfo_buf = NULL; + dict_t *lockinfo = NULL, *newdict = NULL; + afr_local_t *local = NULL; - call_cnt = --local->call_count; + LOCK(&frame->lock); + { + local = frame->local; - if ((op_ret < 0) || (!dict && !xdata)) { - goto unlock; - } + call_cnt = --local->call_count; - if (xdata) { - if (!local->xdata_rsp) { - local->xdata_rsp = dict_new (); - if (!local->xdata_rsp) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unlock; - } - } - } + if ((op_ret < 0) || (!dict && !xdata)) { + goto unlock; + } - if (!dict) { - goto unlock; + if (xdata) { + if (!local->xdata_rsp) { + local->xdata_rsp = dict_new(); + if (!local->xdata_rsp) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; } + } + } - op_ret = dict_get_ptr_and_len (dict, GF_XATTR_LOCKINFO_KEY, - (void **)&lockinfo_buf, &len); + if (!dict) { + goto unlock; + } - if (!lockinfo_buf) { - goto unlock; - } + op_ret = dict_get_ptr_and_len(dict, GF_XATTR_LOCKINFO_KEY, + (void **)&lockinfo_buf, &len); - if (!local->dict) { - local->dict = dict_new (); - if (!local->dict) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unlock; - } - } - } -unlock: - UNLOCK (&frame->lock); - - if (lockinfo_buf != NULL) { - lockinfo = dict_new (); - if (lockinfo == NULL) { - local->op_ret = -1; - local->op_errno = ENOMEM; - } else { - op_ret = dict_unserialize (lockinfo_buf, len, - &lockinfo); - - if (lockinfo && local->dict) { - dict_copy (lockinfo, local->dict); - } - } + if (!lockinfo_buf) { + goto unlock; } - if (xdata && local->xdata_rsp) { - dict_copy (xdata, local->xdata_rsp); + if (!local->dict) { + local->dict = dict_new(); + if (!local->dict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } } + } +unlock: + UNLOCK(&frame->lock); - if (!call_cnt) { - newdict = dict_new (); - if (!newdict) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unwind; - } + if (lockinfo_buf != NULL) { + lockinfo = dict_new(); + if (lockinfo == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + } else { + op_ret = dict_unserialize(lockinfo_buf, len, &lockinfo); - len = dict_serialized_length (local->dict); - if (len == 0) { - goto unwind; - } + if (lockinfo && local->dict) { + dict_copy(lockinfo, local->dict); + } + } + } - lockinfo_buf = GF_CALLOC (1, len, gf_common_mt_char); - if (!lockinfo_buf) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unwind; - } + if (xdata && local->xdata_rsp) { + dict_copy(xdata, local->xdata_rsp); + } - op_ret = dict_serialize (local->dict, lockinfo_buf); - if (op_ret < 0) { - local->op_ret = -1; - local->op_errno = -op_ret; - } + if (!call_cnt) { + newdict = dict_new(); + if (!newdict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } - op_ret = dict_set_dynptr (newdict, GF_XATTR_LOCKINFO_KEY, - (void *)lockinfo_buf, len); - if (op_ret < 0) { - local->op_ret = -1; - local->op_errno = -op_ret; - goto unwind; - } + op_ret = dict_allocate_and_serialize( + local->dict, (char **)&lockinfo_buf, (unsigned int *)&len); + if (op_ret != 0) { + local->op_ret = -1; + goto unwind; + } - unwind: - AFR_STACK_UNWIND (getxattr, frame, op_ret, - op_errno, newdict, - local->xdata_rsp); + op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY, + (void *)lockinfo_buf, len); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + goto unwind; } - dict_unref (lockinfo); + unwind: + AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, newdict, + local->xdata_rsp); + } - return 0; + dict_unref(lockinfo); + + return 0; } int32_t -afr_fgetxattr_lockinfo_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) +afr_fgetxattr_lockinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - int call_cnt = 0, len = 0; - char *lockinfo_buf = NULL; - dict_t *lockinfo = NULL, *newdict = NULL; - afr_local_t *local = NULL; - - LOCK (&frame->lock); - { - local = frame->local; + int call_cnt = 0, len = 0; + char *lockinfo_buf = NULL; + dict_t *lockinfo = NULL, *newdict = NULL; + afr_local_t *local = NULL; - call_cnt = --local->call_count; + LOCK(&frame->lock); + { + local = frame->local; - if ((op_ret < 0) || (!dict && !xdata)) { - goto unlock; - } + call_cnt = --local->call_count; - if (xdata) { - if (!local->xdata_rsp) { - local->xdata_rsp = dict_new (); - if (!local->xdata_rsp) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unlock; - } - } - } + if ((op_ret < 0) || (!dict && !xdata)) { + goto unlock; + } - if (!dict) { - goto unlock; + if (xdata) { + if (!local->xdata_rsp) { + local->xdata_rsp = dict_new(); + if (!local->xdata_rsp) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; } + } + } - op_ret = dict_get_ptr_and_len (dict, GF_XATTR_LOCKINFO_KEY, - (void **)&lockinfo_buf, &len); + if (!dict) { + goto unlock; + } - if (!lockinfo_buf) { - goto unlock; - } + op_ret = dict_get_ptr_and_len(dict, GF_XATTR_LOCKINFO_KEY, + (void **)&lockinfo_buf, &len); - if (!local->dict) { - local->dict = dict_new (); - if (!local->dict) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unlock; - } - } - } -unlock: - UNLOCK (&frame->lock); - - if (lockinfo_buf != NULL) { - lockinfo = dict_new (); - if (lockinfo == NULL) { - local->op_ret = -1; - local->op_errno = ENOMEM; - } else { - op_ret = dict_unserialize (lockinfo_buf, len, - &lockinfo); - - if (lockinfo && local->dict) { - dict_copy (lockinfo, local->dict); - } - } + if (!lockinfo_buf) { + goto unlock; } - if (xdata && local->xdata_rsp) { - dict_copy (xdata, local->xdata_rsp); + if (!local->dict) { + local->dict = dict_new(); + if (!local->dict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } } + } +unlock: + UNLOCK(&frame->lock); - if (!call_cnt) { - newdict = dict_new (); - if (!newdict) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unwind; - } + if (lockinfo_buf != NULL) { + lockinfo = dict_new(); + if (lockinfo == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + } else { + op_ret = dict_unserialize(lockinfo_buf, len, &lockinfo); - len = dict_serialized_length (local->dict); - if (len <= 0) { - goto unwind; - } + if (lockinfo && local->dict) { + dict_copy(lockinfo, local->dict); + } + } + } - lockinfo_buf = GF_CALLOC (1, len, gf_common_mt_char); - if (!lockinfo_buf) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unwind; - } + if (xdata && local->xdata_rsp) { + dict_copy(xdata, local->xdata_rsp); + } - op_ret = dict_serialize (local->dict, lockinfo_buf); - if (op_ret < 0) { - local->op_ret = -1; - local->op_errno = -op_ret; - } + if (!call_cnt) { + newdict = dict_new(); + if (!newdict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } - op_ret = dict_set_dynptr (newdict, GF_XATTR_LOCKINFO_KEY, - (void *)lockinfo_buf, len); - if (op_ret < 0) { - local->op_ret = -1; - local->op_errno = -op_ret; - goto unwind; - } + op_ret = dict_allocate_and_serialize( + local->dict, (char **)&lockinfo_buf, (unsigned int *)&len); + if (op_ret != 0) { + local->op_ret = -1; + goto unwind; + } - unwind: - AFR_STACK_UNWIND (fgetxattr, frame, op_ret, - op_errno, newdict, - local->xdata_rsp); + op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY, + (void *)lockinfo_buf, len); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + goto unwind; } - dict_unref (lockinfo); + unwind: + AFR_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, newdict, + local->xdata_rsp); + } - return 0; + dict_unref(lockinfo); + + return 0; } int32_t -afr_fgetxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) +afr_fgetxattr_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - afr_local_t *local = NULL; - int32_t callcnt = 0; - int ret = 0; - char *xattr = NULL; - char *xattr_serz = NULL; - char xattr_cky[1024] = {0,}; - dict_t *nxattr = NULL; - long cky = 0; - int32_t padding = 0; - int32_t tlen = 0; - - if (!frame || !frame->local || !this) { - gf_log ("", GF_LOG_ERROR, "possible NULL deref"); - goto out; + afr_local_t *local = NULL; + int32_t callcnt = 0; + int ret = 0; + char *xattr = NULL; + char *xattr_serz = NULL; + int keylen = 0; + char xattr_cky[1024] = { + 0, + }; + int xattr_cky_len = 0; + dict_t *nxattr = NULL; + long cky = 0; + int32_t padding = 0; + int32_t tlen = 0; + + if (!frame || !frame->local || !this) { + gf_msg("", GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "possible NULL deref"); + goto out; + } + + local = frame->local; + cky = (long)cookie; + keylen = strlen(local->cont.getxattr.name); + xattr_cky_len = snprintf(xattr_cky, sizeof(xattr_cky), "%s-%ld", + local->cont.getxattr.name, cky); + LOCK(&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret < 0) { + local->op_errno = op_errno; + } else { + local->op_ret = op_ret; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref(xdata); } - local = frame->local; - cky = (long) cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (!dict || (op_ret < 0)) - goto out; - - if (!local->dict) - local->dict = dict_new (); - - if (local->dict) { - ret = dict_get_str (dict, - local->cont.getxattr.name, - &xattr); - if (ret) - goto out; - - xattr = gf_strdup (xattr); - - (void)snprintf (xattr_cky, 1024, "%s-%ld", - local->cont.getxattr.name, cky); - ret = dict_set_dynstr (local->dict, - xattr_cky, xattr); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "Cannot set xattr cookie key"); - goto out; - } - - local->cont.getxattr.xattr_len - += strlen (xattr) + 1; - } + if (!dict || (op_ret < 0)) + goto unlock; + + if (!local->dict) { + local->dict = dict_new(); + if (!local->dict) + goto unlock; } -out: - UNLOCK (&frame->lock); - - if (!callcnt) { - if (!local->cont.getxattr.xattr_len) - goto unwind; - - nxattr = dict_new (); - if (!nxattr) - goto unwind; - - /* extra bytes for decorations (brackets and <>'s) */ - padding += strlen (this->name) - + strlen (AFR_PATHINFO_HEADER) + 4; - local->cont.getxattr.xattr_len += (padding + 2); - - xattr_serz = GF_CALLOC (local->cont.getxattr.xattr_len, - sizeof (char), gf_common_mt_char); - - if (!xattr_serz) - goto unwind; - - /* the xlator info */ - (void) sprintf (xattr_serz, "(<"AFR_PATHINFO_HEADER"%s> ", - this->name); - - /* actual series of pathinfo */ - ret = dict_serialize_value_with_delim (local->dict, - xattr_serz - + strlen (xattr_serz), - &tlen, ' '); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Error serializing" - " dictionary"); - goto unwind; - } + ret = dict_get_strn(dict, local->cont.getxattr.name, keylen, &xattr); + if (ret) + goto unlock; + + xattr = gf_strdup(xattr); + + ret = dict_set_dynstrn(local->dict, xattr_cky, xattr_cky_len, xattr); + if (ret) { + UNLOCK(&frame->lock); + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Cannot set xattr cookie key"); + goto post_unlock; + } + + local->cont.getxattr.xattr_len += strlen(xattr) + 1; + } +unlock: + UNLOCK(&frame->lock); +post_unlock: + if (!callcnt) { + if (!local->cont.getxattr.xattr_len) + goto unwind; + + nxattr = dict_new(); + if (!nxattr) + goto unwind; - /* closing part */ - *(xattr_serz + padding + tlen) = ')'; - *(xattr_serz + padding + tlen + 1) = '\0'; + /* extra bytes for decorations (brackets and <>'s) */ + padding += strlen(this->name) + SLEN(AFR_PATHINFO_HEADER) + 4; + local->cont.getxattr.xattr_len += (padding + 2); - ret = dict_set_dynstr (nxattr, local->cont.getxattr.name, - xattr_serz); - if (ret) - gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo" - " key in dict"); + xattr_serz = GF_MALLOC(local->cont.getxattr.xattr_len, + gf_common_mt_char); - unwind: - AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, nxattr, - xdata); + if (!xattr_serz) + goto unwind; - if (nxattr) - dict_unref (nxattr); + /* the xlator info */ + int xattr_serz_len = sprintf( + xattr_serz, "(<" AFR_PATHINFO_HEADER "%s> ", this->name); + + /* actual series of pathinfo */ + ret = dict_serialize_value_with_delim( + local->dict, xattr_serz + xattr_serz_len, &tlen, ' '); + if (ret) { + GF_FREE(xattr_serz); + goto unwind; + } + + /* closing part */ + *(xattr_serz + padding + tlen) = ')'; + *(xattr_serz + padding + tlen + 1) = '\0'; + + ret = dict_set_dynstrn(nxattr, local->cont.getxattr.name, keylen, + xattr_serz); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Cannot set pathinfo key in dict"); + if (ret == -EINVAL) + GF_FREE(xattr_serz); } - return ret; + unwind: + AFR_STACK_UNWIND(fgetxattr, frame, local->op_ret, local->op_errno, + nxattr, local->xdata_rsp); + + if (nxattr) + dict_unref(nxattr); + } + +out: + return ret; } int32_t -afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) +afr_getxattr_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - afr_local_t *local = NULL; - int32_t callcnt = 0; - int ret = 0; - char *xattr = NULL; - char *xattr_serz = NULL; - char xattr_cky[1024] = {0,}; - dict_t *nxattr = NULL; - long cky = 0; - int32_t padding = 0; - int32_t tlen = 0; - - if (!frame || !frame->local || !this) { - gf_log ("", GF_LOG_ERROR, "possible NULL deref"); - goto out; + afr_local_t *local = NULL; + int32_t callcnt = 0; + int ret = 0; + char *xattr = NULL; + char *xattr_serz = NULL; + char xattr_cky[1024] = { + 0, + }; + int keylen = 0; + int xattr_cky_len = 0; + dict_t *nxattr = NULL; + long cky = 0; + int32_t padding = 0; + int32_t tlen = 0; + + if (!frame || !frame->local || !this) { + gf_msg("", GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "possible NULL deref"); + goto out; + } + + local = frame->local; + cky = (long)cookie; + keylen = strlen(local->cont.getxattr.name); + xattr_cky_len = snprintf(xattr_cky, sizeof(xattr_cky), "%s-%ld", + local->cont.getxattr.name, cky); + LOCK(&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret < 0) { + local->op_errno = op_errno; + } else { + local->op_ret = op_ret; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref(xdata); } - local = frame->local; - cky = (long) cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (!dict || (op_ret < 0)) - goto out; - - if (!local->dict) - local->dict = dict_new (); - - if (local->dict) { - ret = dict_get_str (dict, - local->cont.getxattr.name, - &xattr); - if (ret) - goto out; - - xattr = gf_strdup (xattr); - - (void)snprintf (xattr_cky, 1024, "%s-%ld", - local->cont.getxattr.name, cky); - ret = dict_set_dynstr (local->dict, - xattr_cky, xattr); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "Cannot set xattr cookie key"); - goto out; - } - - local->cont.getxattr.xattr_len += strlen (xattr) + 1; - } - } - out: - UNLOCK (&frame->lock); - - if (!callcnt) { - if (!local->cont.getxattr.xattr_len) - goto unwind; - - nxattr = dict_new (); - if (!nxattr) - goto unwind; - - /* extra bytes for decorations (brackets and <>'s) */ - padding += strlen (this->name) + strlen (AFR_PATHINFO_HEADER) + 4; - local->cont.getxattr.xattr_len += (padding + 2); - - xattr_serz = GF_CALLOC (local->cont.getxattr.xattr_len, - sizeof (char), gf_common_mt_char); - - if (!xattr_serz) - goto unwind; - - /* the xlator info */ - (void) sprintf (xattr_serz, "(<"AFR_PATHINFO_HEADER"%s> ", - this->name); - - /* actual series of pathinfo */ - ret = dict_serialize_value_with_delim (local->dict, - xattr_serz + strlen (xattr_serz), - &tlen, ' '); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Error serializing" - " dictionary"); - goto unwind; - } + if (!dict || (op_ret < 0)) + goto unlock; + + if (!local->dict) { + local->dict = dict_new(); + if (!local->dict) + goto unlock; + } + ret = dict_get_strn(dict, local->cont.getxattr.name, keylen, &xattr); + if (ret) + goto unlock; + + xattr = gf_strdup(xattr); - /* closing part */ - *(xattr_serz + padding + tlen) = ')'; - *(xattr_serz + padding + tlen + 1) = '\0'; + ret = dict_set_dynstrn(local->dict, xattr_cky, xattr_cky_len, xattr); + if (ret) { + UNLOCK(&frame->lock); + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Cannot set xattr cookie key"); + goto post_unlock; + } + + local->cont.getxattr.xattr_len += strlen(xattr) + 1; + } +unlock: + UNLOCK(&frame->lock); +post_unlock: + if (!callcnt) { + if (!local->cont.getxattr.xattr_len) + goto unwind; - ret = dict_set_dynstr (nxattr, local->cont.getxattr.name, - xattr_serz); - if (ret) - gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo" - " key in dict"); + nxattr = dict_new(); + if (!nxattr) + goto unwind; + + /* extra bytes for decorations (brackets and <>'s) */ + padding += strlen(this->name) + SLEN(AFR_PATHINFO_HEADER) + 4; + local->cont.getxattr.xattr_len += (padding + 2); + + xattr_serz = GF_MALLOC(local->cont.getxattr.xattr_len, + gf_common_mt_char); + + if (!xattr_serz) + goto unwind; + + /* the xlator info */ + int xattr_serz_len = sprintf( + xattr_serz, "(<" AFR_PATHINFO_HEADER "%s> ", this->name); + + /* actual series of pathinfo */ + ret = dict_serialize_value_with_delim( + local->dict, xattr_serz + xattr_serz_len, &tlen, ' '); + if (ret) { + GF_FREE(xattr_serz); + goto unwind; + } - unwind: - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, nxattr, - xdata); + /* closing part */ + *(xattr_serz + padding + tlen) = ')'; + *(xattr_serz + padding + tlen + 1) = '\0'; - if (nxattr) - dict_unref (nxattr); + ret = dict_set_dynstrn(nxattr, local->cont.getxattr.name, keylen, + xattr_serz); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Cannot set pathinfo key in dict"); + if (ret == -EINVAL) + GF_FREE(xattr_serz); } - return ret; + unwind: + AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno, + nxattr, local->xdata_rsp); + + if (nxattr) + dict_unref(nxattr); + } + +out: + return ret; } static int -afr_aggregate_stime_xattr (dict_t *this, char *key, data_t *value, void *data) +afr_aggregate_stime_xattr(dict_t *this, char *key, data_t *value, void *data) { - int ret = 0; + int ret = 0; - if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) - ret = gf_get_min_stime (THIS, data, key, value); + if (fnmatch(GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) + ret = gf_get_max_stime(THIS, data, key, value); - return ret; + return ret; } int32_t -afr_common_getxattr_stime_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) +afr_common_getxattr_stime_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - afr_local_t *local = NULL; - int32_t callcnt = 0; + afr_local_t *local = NULL; + int32_t callcnt = 0; - if (!frame || !frame->local || !this) { - gf_log ("", GF_LOG_ERROR, "possible NULL deref"); - goto out; - } - - local = frame->local; + if (!frame || !frame->local || !this) { + gf_msg("", GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "possible NULL deref"); + goto out; + } - LOCK (&frame->lock); - { - callcnt = --local->call_count; + local = frame->local; - if (!dict || (op_ret < 0)) { - local->op_errno = op_errno; - goto cleanup; - } + LOCK(&frame->lock); + { + callcnt = --local->call_count; - if (!local->dict) - local->dict = dict_copy_with_ref (dict, NULL); - else - dict_foreach (dict, afr_aggregate_stime_xattr, - local->dict); - local->op_ret = 0; + if (!dict || (op_ret < 0)) { + local->op_errno = op_errno; + goto cleanup; } + if (!local->dict) + local->dict = dict_copy_with_ref(dict, NULL); + else + dict_foreach(dict, afr_aggregate_stime_xattr, local->dict); + local->op_ret = 0; + } + cleanup: - UNLOCK (&frame->lock); + UNLOCK(&frame->lock); - if (!callcnt) { - AFR_STACK_UNWIND (getxattr, frame, local->op_ret, - local->op_errno, local->dict, xdata); - } + if (!callcnt) { + AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno, + local->dict, xdata); + } out: - return 0; + return 0; } - static gf_boolean_t -afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk, - gf_boolean_t is_fgetxattr) +afr_is_special_xattr(const char *name, fop_getxattr_cbk_t *cbk, + gf_boolean_t is_fgetxattr) { - gf_boolean_t is_spl = _gf_true; - - GF_ASSERT (cbk); - if (!cbk) { - is_spl = _gf_false; - goto out; + gf_boolean_t is_spl = _gf_true; + + GF_ASSERT(cbk); + if (!cbk || !name) { + is_spl = _gf_false; + goto out; + } + + if (!strcmp(name, GF_XATTR_PATHINFO_KEY) || + !strcmp(name, GF_XATTR_USER_PATHINFO_KEY)) { + if (is_fgetxattr) { + *cbk = afr_fgetxattr_pathinfo_cbk; + } else { + *cbk = afr_getxattr_pathinfo_cbk; } - - if (!strcmp (name, GF_XATTR_PATHINFO_KEY)) { - if (is_fgetxattr) { - *cbk = afr_fgetxattr_pathinfo_cbk; - } else { - *cbk = afr_getxattr_pathinfo_cbk; - } - } else if (!strncmp (name, GF_XATTR_CLRLK_CMD, - strlen (GF_XATTR_CLRLK_CMD))) { - if (is_fgetxattr) { - *cbk = afr_fgetxattr_clrlk_cbk; - } else { - *cbk = afr_getxattr_clrlk_cbk; - } - } else if (!strncmp (name, GF_XATTR_LOCKINFO_KEY, - strlen (GF_XATTR_LOCKINFO_KEY))) { - if (is_fgetxattr) { - *cbk = afr_fgetxattr_lockinfo_cbk; - } else { - *cbk = afr_getxattr_lockinfo_cbk; - } - } else if (fnmatch (GF_XATTR_STIME_PATTERN, name, FNM_NOESCAPE) == 0) { - *cbk = afr_common_getxattr_stime_cbk; + } else if (!strncmp(name, GF_XATTR_CLRLK_CMD, SLEN(GF_XATTR_CLRLK_CMD))) { + if (is_fgetxattr) { + *cbk = afr_fgetxattr_clrlk_cbk; } else { - is_spl = _gf_false; + *cbk = afr_getxattr_clrlk_cbk; } + } else if (!strncmp(name, GF_XATTR_LOCKINFO_KEY, + SLEN(GF_XATTR_LOCKINFO_KEY))) { + if (is_fgetxattr) { + *cbk = afr_fgetxattr_lockinfo_cbk; + } else { + *cbk = afr_getxattr_lockinfo_cbk; + } + } else if (fnmatch(GF_XATTR_STIME_PATTERN, name, FNM_NOESCAPE) == 0) { + *cbk = afr_common_getxattr_stime_cbk; + } else if (strcmp(name, QUOTA_SIZE_KEY) == 0) { + *cbk = afr_getxattr_quota_size_cbk; + } else if (!strcmp(name, GF_XATTR_LIST_NODE_UUIDS_KEY)) { + *cbk = afr_getxattr_list_node_uuids_cbk; + } else { + is_spl = _gf_false; + } out: - return is_spl; + return is_spl; } static void -afr_getxattr_frm_all_children (xlator_t *this, call_frame_t *frame, - const char *name, loc_t *loc, - fop_getxattr_cbk_t cbk) +afr_getxattr_all_subvols(xlator_t *this, call_frame_t *frame, const char *name, + loc_t *loc, fop_getxattr_cbk_t cbk) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - xlator_t **children = NULL; - int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + + priv = this->private; + + local = frame->local; + // local->call_count set in afr_local_init + call_count = local->call_count; + + if (!strcmp(name, GF_XATTR_LIST_NODE_UUIDS_KEY)) { + GF_FREE(local->cont.getxattr.name); + local->cont.getxattr.name = gf_strdup(GF_XATTR_NODE_UUID_KEY); + } + + // If up-children count is 0, afr_local_init would have failed already + // and the call would have unwound so not handling it here. + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE(frame, cbk, (void *)(long)i, priv->children[i], + priv->children[i]->fops->getxattr, loc, + local->cont.getxattr.name, NULL); + if (!--call_count) + break; + } + } + return; +} - priv = this->private; - children = priv->children; +int +afr_marker_populate_args(call_frame_t *frame, int type, int *gauge, + xlator_t **subvols) +{ + xlator_t *this = frame->this; + afr_private_t *priv = this->private; - local = frame->local; - local->call_count = priv->child_count; + memcpy(subvols, priv->children, sizeof(*subvols) * priv->child_count); - for (i = 0; i < priv->child_count; i++) { - STACK_WIND_COOKIE (frame, cbk, - (void *) (long) i, - children[i], children[i]->fops->getxattr, - loc, name, NULL); - } - return; + if (type == MARKER_XTIME_TYPE) { + /*Don't error out on ENOENT/ENOTCONN */ + gauge[MCNT_NOTFOUND] = 0; + gauge[MCNT_ENOTCONN] = 0; + } + return priv->child_count; +} + +static int +afr_handle_heal_xattrs(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *heal_op) +{ + int ret = -1; + afr_spb_status_t *data = NULL; + + if (!strcmp(heal_op, GF_HEAL_INFO)) { + afr_get_heal_info(frame, this, loc); + ret = 0; + goto out; + } + + if (!strcmp(heal_op, GF_AFR_HEAL_SBRAIN)) { + afr_heal_splitbrain_file(frame, this, loc); + ret = 0; + goto out; + } + + if (!strcmp(heal_op, GF_AFR_SBRAIN_STATUS)) { + data = GF_CALLOC(1, sizeof(*data), gf_afr_mt_spb_status_t); + if (!data) { + ret = 1; + goto out; + } + data->frame = frame; + data->loc = loc; + ret = synctask_new(this->ctx->env, afr_get_split_brain_status, + afr_get_split_brain_status_cbk, NULL, data); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS, + "Failed to create" + " synctask. Unable to fetch split-brain status" + " for %s.", + loc->name); + ret = 1; + goto out; + } + goto out; + } + +out: + if (ret == 1) { + AFR_STACK_UNWIND(getxattr, frame, -1, ENOMEM, NULL, NULL); + if (data) + GF_FREE(data); + ret = 0; + } + return ret; } int32_t -afr_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name, dict_t *xdata) +afr_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, + dict_t *xdata) { - afr_private_t *priv = NULL; - xlator_t **children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - xlator_list_t *trav = NULL; - xlator_t **sub_volumes = NULL; - int i = 0; - int32_t op_errno = 0; - int32_t read_child = -1; - int ret = -1; - fop_getxattr_cbk_t cbk = NULL; - int afr_xtime_gauge[MCNT_MAX] = {0,}; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + xlator_t **children = NULL; + int i = 0; + int32_t op_errno = 0; + int ret = -1; + fop_getxattr_cbk_t cbk = NULL; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - loc_copy (&local->loc, loc); - if (!name) - goto no_name; + priv = this->private; - local->cont.getxattr.name = gf_strdup (name); + children = priv->children; - if (!strncmp (name, AFR_XATTR_PREFIX, - strlen (AFR_XATTR_PREFIX))) { - gf_log (this->name, GF_LOG_INFO, - "%s: no data present for key %s", - loc->path, name); - op_errno = ENODATA; - goto out; - } - if ((strcmp (GF_XATTR_MARKER_KEY, name) == 0) - && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { + loc_copy(&local->loc, loc); - local->marker.call_count = priv->child_count; + local->op = GF_FOP_GETXATTR; - sub_volumes = alloca ( priv->child_count * sizeof (xlator_t *)); - for (i = 0, trav = this->children; trav ; - trav = trav->next, i++) { + if (xdata) + local->xdata_req = dict_ref(xdata); - *(sub_volumes + i) = trav->xlator; - } + if (!name) + goto no_name; - if (cluster_getmarkerattr (frame, this, loc, name, - local, afr_getxattr_unwind, - sub_volumes, - priv->child_count, - MARKER_UUID_TYPE, - marker_uuid_default_gauge, - priv->vol_uuid)) { - - gf_log (this->name, GF_LOG_INFO, - "%s: failed to get marker attr (%s)", - loc->path, name); - op_errno = EINVAL; - goto out; - } + local->cont.getxattr.name = gf_strdup(name); - return 0; - } + if (!local->cont.getxattr.name) { + op_errno = ENOMEM; + goto out; + } - /* - * if we are doing getxattr with pathinfo as the key then we - * collect information from all childs - */ - if (afr_is_special_xattr (name, &cbk, 0)) { - afr_getxattr_frm_all_children (this, frame, name, - loc, cbk); - return 0; - } + if (!strncmp(name, AFR_XATTR_PREFIX, SLEN(AFR_XATTR_PREFIX))) { + op_errno = ENODATA; + goto out; + } - if (XATTR_IS_NODE_UUID (name)) { - i = 0; - STACK_WIND_COOKIE (frame, afr_getxattr_node_uuid_cbk, - (void *) (long) i, - children[i], - children[i]->fops->getxattr, - loc, name, xdata); - return 0; - } + if (cluster_handle_marker_getxattr(frame, loc, name, priv->vol_uuid, + afr_getxattr_unwind, + afr_marker_populate_args) == 0) + return 0; - if (*priv->vol_uuid) { - if ((match_uuid_local (name, priv->vol_uuid) == 0) - && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { - local->marker.call_count = priv->child_count; - - sub_volumes = alloca ( priv->child_count - * sizeof (xlator_t *)); - for (i = 0, trav = this->children; trav ; - trav = trav->next, i++) { - - *(sub_volumes + i) = trav->xlator; - - } - - /* don't err out on getting ENOTCONN (brick down) - * from a subset of the bricks - */ - memcpy (afr_xtime_gauge, marker_xtime_default_gauge, - sizeof (afr_xtime_gauge)); - afr_xtime_gauge[MCNT_NOTFOUND] = 0; - afr_xtime_gauge[MCNT_ENOTCONN] = 0; - if (cluster_getmarkerattr (frame, this, loc, - name, local, - afr_getxattr_unwind, - sub_volumes, - priv->child_count, - MARKER_XTIME_TYPE, - afr_xtime_gauge, - priv->vol_uuid)) { - gf_log (this->name, GF_LOG_INFO, - "%s: failed to get marker attr (%s)", - loc->path, name); - op_errno = EINVAL; - goto out; - } - - return 0; - } - } + ret = afr_handle_heal_xattrs(frame, this, &local->loc, name); + if (ret == 0) + return 0; -no_name: - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } + /* + * Heal daemons don't have IO threads ... and as a result they + * send this getxattr down and eventually crash :( + */ + op_errno = -1; + GF_CHECK_XATTR_KEY_AND_GOTO(name, IO_THREADS_QUEUE_SIZE_KEY, op_errno, out); + + /* + * Special xattrs which need responses from all subvols + */ + if (afr_is_special_xattr(name, &cbk, 0)) { + afr_getxattr_all_subvols(this, frame, name, loc, cbk); + return 0; + } - read_child = afr_inode_get_read_ctx (this, loc->inode, - local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.getxattr.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } + if (XATTR_IS_NODE_UUID(name)) { + i = 0; + STACK_WIND_COOKIE(frame, afr_getxattr_node_uuid_cbk, (void *)(long)i, + children[i], children[i]->fops->getxattr, loc, name, + xdata); + return 0; + } + +no_name: - STACK_WIND_COOKIE (frame, afr_getxattr_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->getxattr, - loc, name, xdata); + afr_read_txn(frame, this, local->loc.inode, afr_getxattr_wind, + AFR_METADATA_TRANSACTION); - ret = 0; + ret = 0; out: - if (ret < 0) - AFR_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL); - return 0; + if (ret < 0) + AFR_STACK_UNWIND(getxattr, frame, -1, op_errno, NULL, NULL); + return 0; } /* {{{ fgetxattr */ - int32_t -afr_fgetxattr_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) +afr_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - priv = this->private; - children = priv->children; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - read_child = (long) cookie; - - if (op_ret == -1) { - last_index = &local->cont.getxattr.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; - - unwind = 0; - STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->fgetxattr, - local->fd, - local->cont.getxattr.name, - NULL); - } + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; -out: - if (unwind) { - if (op_ret >= 0 && dict) - __filter_xattrs (dict); + afr_read_txn_continue(frame, this, (long)cookie); + return 0; + } - AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, - xdata); - } + if (dict) + afr_filter_xattrs(dict); - return 0; -} + AFR_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, dict, xdata); -int32_t -afr_fgetxattr_unwind (call_frame_t *frame, - int op_ret, int op_errno, dict_t *dict, dict_t *xdata) + return 0; +} +int +afr_fgetxattr_wind(call_frame_t *frame, xlator_t *this, int subvol) { - AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, xdata); + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (subvol == -1) { + AFR_STACK_UNWIND(fgetxattr, frame, local->op_ret, local->op_errno, NULL, + NULL); return 0; + } + + STACK_WIND_COOKIE(frame, afr_fgetxattr_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->fgetxattr, local->fd, + local->cont.getxattr.name, local->xdata_req); + return 0; } static void -afr_fgetxattr_frm_all_children (xlator_t *this, call_frame_t *frame, - const char *name, fd_t *fd, - fop_fgetxattr_cbk_t cbk) +afr_fgetxattr_all_subvols(xlator_t *this, call_frame_t *frame, + fop_fgetxattr_cbk_t cbk) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - xlator_t **children = NULL; - int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; - priv = this->private; - children = priv->children; + priv = this->private; - local = frame->local; - local->call_count = priv->child_count; + local = frame->local; + // local->call_count set in afr_local_init + call_count = local->call_count; - for (i = 0; i < priv->child_count; i++) { - STACK_WIND_COOKIE (frame, cbk, - (void *) (long) i, - children[i], children[i]->fops->fgetxattr, - fd, name, NULL); + // If up-children count is 0, afr_local_init would have failed already + // and the call would have unwound so not handling it here. + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE(frame, cbk, (void *)(long)i, priv->children[i], + priv->children[i]->fops->fgetxattr, local->fd, + local->cont.getxattr.name, NULL); + if (!--call_count) + break; } + } - return; + return; } -int32_t -afr_fgetxattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, const char *name, dict_t *xdata) +int +afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) { - afr_private_t *priv = NULL; - xlator_t **children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - int32_t op_ret = -1; - int32_t op_errno = 0; - int32_t read_child = -1; - fop_fgetxattr_cbk_t cbk = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; - - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (local, out); - frame->local = local; + afr_local_t *local = NULL; + int32_t op_errno = 0; + fop_fgetxattr_cbk_t cbk = NULL; + + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->op = GF_FOP_FGETXATTR; + local->fd = fd_ref(fd); + if (name) { + local->cont.getxattr.name = gf_strdup(name); + if (!local->cont.getxattr.name) { + op_errno = ENOMEM; + goto out; + } + } + if (xdata) + local->xdata_req = dict_ref(xdata); + + /* pathinfo gets handled only in getxattr(), but we need to handle + * lockinfo. + * If we are doing fgetxattr with lockinfo as the key then we + * collect information from all children. + */ + if (afr_is_special_xattr(name, &cbk, 1)) { + afr_fgetxattr_all_subvols(this, frame, cbk); + return 0; + } - op_ret = afr_local_init (local, priv, &op_errno); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } + afr_fix_open(fd, this); - local->fd = fd_ref (fd); - if (name) - local->cont.getxattr.name = gf_strdup (name); + afr_read_txn(frame, this, fd->inode, afr_fgetxattr_wind, + AFR_METADATA_TRANSACTION); - /* pathinfo gets handled only in getxattr(), but we need to handle - * lockinfo. - * If we are doing fgetxattr with lockinfo as the key then we - * collect information from all children. - */ - if (afr_is_special_xattr (name, &cbk, 1)) { - afr_fgetxattr_frm_all_children (this, frame, name, - fd, cbk); - return 0; - } + return 0; +out: + AFR_STACK_UNWIND(fgetxattr, frame, -1, op_errno, NULL, NULL); + return 0; +} - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } +/* }}} */ - read_child = afr_inode_get_read_ctx (this, fd->inode, - local->fresh_children); - op_ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.getxattr.last_index); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; - goto out; - } +/* {{{ readv */ - STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->fgetxattr, - fd, name, xdata); +int +afr_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iovec *vector, int32_t count, + struct iatt *buf, struct iobref *iobref, dict_t *xdata) +{ + afr_local_t *local = NULL; - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, NULL, - NULL); - } + local = frame->local; + + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; + + afr_read_txn_continue(frame, this, (long)cookie); return 0; + } + + AFR_STACK_UNWIND(readv, frame, op_ret, op_errno, vector, count, buf, iobref, + xdata); + return 0; } +int +afr_readv_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; -/* }}} */ + local = frame->local; + priv = this->private; -/* {{{ readv */ + if (subvol == -1) { + AFR_STACK_UNWIND(readv, frame, local->op_ret, local->op_errno, 0, 0, 0, + 0, 0); + return 0; + } -/** - * read algorithm: - * - * if the user has specified a read subvolume, use it - * otherwise - - * use the inode number to hash it to one of the subvolumes, and - * read from there (to balance read load) - * - * if any of the above read's fail, try the children in sequence - * beginning at the beginning - */ + STACK_WIND_COOKIE( + frame, afr_readv_cbk, (void *)(long)subvol, priv->children[subvol], + priv->children[subvol]->fops->readv, local->fd, local->cont.readv.size, + local->cont.readv.offset, local->cont.readv.flags, local->xdata_req); + return 0; +} -int32_t -afr_readv_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iovec *vector, int32_t count, struct iatt *buf, - struct iobref *iobref, dict_t *xdata) +int +afr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t *fresh_children = NULL; - int32_t read_child = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_local_t *local = NULL; + int32_t op_errno = 0; - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - children = priv->children; + local->op = GF_FOP_READ; + local->fd = fd_ref(fd); + local->cont.readv.size = size; + local->cont.readv.offset = offset; + local->cont.readv.flags = flags; + if (xdata) + local->xdata_req = dict_ref(xdata); - local = frame->local; + afr_fix_open(fd, this); - read_child = (long) cookie; - - if (op_ret == -1) { - last_index = &local->cont.readv.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; - - unwind = 0; - - STACK_WIND_COOKIE (frame, afr_readv_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->readv, - local->fd, local->cont.readv.size, - local->cont.readv.offset, - local->cont.readv.flags, - NULL); - } + afr_read_txn(frame, this, fd->inode, afr_readv_wind, AFR_DATA_TRANSACTION); + return 0; out: - if (unwind) { - AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, - vector, count, buf, iobref, xdata); - } + AFR_STACK_UNWIND(readv, frame, -1, op_errno, 0, 0, 0, 0, 0); - return 0; + return 0; } +/* }}} */ -int32_t -afr_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata) +/* {{{ seek */ + +int +afr_seek_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, off_t offset, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int call_child = 0; - int32_t op_errno = 0; - int32_t read_child = -1; - int ret = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - children = priv->children; - - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } + afr_local_t *local = NULL; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + local = frame->local; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } + afr_read_txn_continue(frame, this, (long)cookie); + return 0; + } - read_child = afr_inode_get_read_ctx (this, fd->inode, local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.readv.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } + AFR_STACK_UNWIND(seek, frame, op_ret, op_errno, offset, xdata); + return 0; +} + +int +afr_seek_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; - local->fd = fd_ref (fd); + if (subvol == -1) { + AFR_STACK_UNWIND(seek, frame, local->op_ret, local->op_errno, 0, NULL); + return 0; + } + + STACK_WIND_COOKIE( + frame, afr_seek_cbk, (void *)(long)subvol, priv->children[subvol], + priv->children[subvol]->fops->seek, local->fd, local->cont.seek.offset, + local->cont.seek.what, local->xdata_req); + return 0; +} - local->cont.readv.size = size; - local->cont.readv.offset = offset; - local->cont.readv.flags = flags; +int +afr_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + gf_seek_what_t what, dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t op_errno = 0; - afr_open_fd_fix (fd, this); + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - STACK_WIND_COOKIE (frame, afr_readv_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readv, - fd, size, offset, flags, xdata); + local->op = GF_FOP_SEEK; + local->fd = fd_ref(fd); + local->cont.seek.offset = offset; + local->cont.seek.what = what; + if (xdata) + local->xdata_req = dict_ref(xdata); - ret = 0; + afr_fix_open(fd, this); + + afr_read_txn(frame, this, fd->inode, afr_seek_wind, AFR_DATA_TRANSACTION); + + return 0; out: - if (ret < 0) { - AFR_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, - NULL, NULL); - } - return 0; -} + AFR_STACK_UNWIND(seek, frame, -1, op_errno, 0, NULL); + return 0; +} /* }}} */ diff --git a/xlators/cluster/afr/src/afr-inode-read.h b/xlators/cluster/afr/src/afr-inode-read.h index e4091a793e0..8c982bc7e6f 100644 --- a/xlators/cluster/afr/src/afr-inode-read.h +++ b/xlators/cluster/afr/src/afr-inode-read.h @@ -12,31 +12,34 @@ #define __INODE_READ_H__ int32_t -afr_access (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t mask, dict_t *xdata); +afr_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata); int32_t -afr_stat (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xdata); +afr_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); int32_t -afr_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd, dict_t *xdata); +afr_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata); int32_t -afr_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size, dict_t *xdata); +afr_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata); int32_t -afr_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata); +afr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata); int32_t -afr_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name, dict_t *xdata); +afr_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, + dict_t *xdata); int32_t -afr_fgetxattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, const char *name, dict_t *xdata); - +afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata); + +int +afr_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + gf_seek_what_t what, dict_t *xdata); +int +afr_handle_quota_size(call_frame_t *frame, xlator_t *this); #endif /* __INODE_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 5f2d3096f66..1d6e4f3570a 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -8,998 +8,779 @@ cases as published by the Free Software Foundation. */ - -#include <libgen.h> #include <unistd.h> -#include <fnmatch.h> #include <sys/time.h> #include <stdlib.h> #include <signal.h> -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" - +#include <glusterfs/glusterfs.h> #include "afr.h" +#include <glusterfs/dict.h> +#include <glusterfs/logging.h> +#include <glusterfs/defaults.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/compat.h> +#include "protocol-common.h" +#include <glusterfs/byte-order.h> #include "afr-transaction.h" -#include "afr-self-heal-common.h" +#include "afr-self-heal.h" +#include "afr-messages.h" -void -__inode_write_fop_cbk (call_frame_t *frame, int child_index, int read_child, - xlator_t *this, int32_t *op_ret, int32_t *op_errno, - struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) -{ - afr_local_t *local = NULL; - - local = frame->local; - - if (afr_fop_failed (*op_ret, *op_errno)) { - local->child_errno[child_index] = *op_errno; - - switch (local->op) { - case GF_FOP_TRUNCATE: - case GF_FOP_FTRUNCATE: - if (*op_errno != EFBIG) - afr_transaction_fop_failed (frame, this, - child_index); +static void +__afr_inode_write_finalize(call_frame_t *frame, xlator_t *this) +{ + int i = 0; + int ret = 0; + int read_subvol = 0; + struct iatt *stbuf = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_read_subvol_args_t args = { + 0, + }; + + local = frame->local; + priv = this->private; + GF_VALIDATE_OR_GOTO(this->name, local->inode, out); + + /*This code needs to stay till DHT sends fops on linked + * inodes*/ + if (!inode_is_linked(local->inode)) { + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret == -1) + continue; + if (!gf_uuid_is_null(local->replies[i].poststat.ia_gfid)) { + gf_uuid_copy(args.gfid, local->replies[i].poststat.ia_gfid); + args.ia_type = local->replies[i].poststat.ia_type; break; - default: - afr_transaction_fop_failed (frame, this, child_index); + } else { + ret = dict_get_bin(local->replies[i].xdata, + DHT_IATT_IN_XDATA_KEY, (void **)&stbuf); + if (ret) + continue; + gf_uuid_copy(args.gfid, stbuf->ia_gfid); + args.ia_type = stbuf->ia_type; break; - } - local->op_errno = *op_errno; - goto out; - } - - if ((local->success_count == 0) || (read_child == child_index)) { - local->op_ret = *op_ret; - if (prebuf) - local->cont.inode_wfop.prebuf = *prebuf; - if (postbuf) - local->cont.inode_wfop.postbuf = *postbuf; - } - - local->success_count++; + } + } + } + + if (local->transaction.type == AFR_METADATA_TRANSACTION) { + read_subvol = afr_metadata_subvol_get(local->inode, this, NULL, + local->readable, NULL, &args); + } else { + read_subvol = afr_data_subvol_get(local->inode, this, NULL, + local->readable, NULL, &args); + } + + local->op_ret = -1; + local->op_errno = afr_final_errno(local, priv); + afr_pick_error_xdata(local, priv, local->inode, local->readable, NULL, + NULL); + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret < 0) + continue; + + /* Order of checks in the compound conditional + below is important. + + - Highest precedence: largest op_ret + - Next precedence: if all op_rets are equal, read subvol + - Least precedence: any succeeded subvol + */ + if ((local->op_ret < local->replies[i].op_ret) || + ((local->op_ret == local->replies[i].op_ret) && + (i == read_subvol))) { + local->op_ret = local->replies[i].op_ret; + local->op_errno = local->replies[i].op_errno; + + local->cont.inode_wfop.prebuf = local->replies[i].prestat; + local->cont.inode_wfop.postbuf = local->replies[i].poststat; + + if (local->replies[i].xdata) { + if (local->xdata_rsp) + dict_unref(local->xdata_rsp); + local->xdata_rsp = dict_ref(local->replies[i].xdata); + } + if (local->replies[i].xattr) { + if (local->xattr_rsp) + dict_unref(local->xattr_rsp); + local->xattr_rsp = dict_ref(local->replies[i].xattr); + } + } + } + + afr_set_in_flight_sb_status(this, frame, local->inode); out: - return; + return; } -/* {{{ writev */ - -void -afr_writev_copy_outvars (call_frame_t *src_frame, call_frame_t *dst_frame) +static void +__afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xattr, dict_t *xdata) { - afr_local_t *src_local = NULL; - afr_local_t *dst_local = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - src_local = src_frame->local; - dst_local = dst_frame->local; + local = frame->local; + priv = this->private; - dst_local->op_ret = src_local->op_ret; - dst_local->op_errno = src_local->op_errno; - dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf; - dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf; -} + local->replies[child_index].valid = 1; -void -afr_writev_unwind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - local = frame->local; + if (AFR_IS_ARBITER_BRICK(priv, child_index) && op_ret == 1) + op_ret = iov_length(local->cont.writev.vector, + local->cont.writev.count); + + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (xdata) + local->replies[child_index].xdata = dict_ref(xdata); - AFR_STACK_UNWIND (writev, frame, - local->op_ret, local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - NULL); + if (op_ret >= 0) { + if (prebuf) + local->replies[child_index].prestat = *prebuf; + if (postbuf) + local->replies[child_index].poststat = *postbuf; + if (xattr) + local->replies[child_index].xattr = dict_ref(xattr); + } else { + afr_transaction_fop_failed(frame, this, child_index); + } + + return; } -call_frame_t* -afr_transaction_detach_fop_frame (call_frame_t *frame) +static int +__afr_inode_write_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xattr, dict_t *xdata) { - afr_local_t * local = NULL; - call_frame_t *fop_frame = NULL; + afr_local_t *local = NULL; + int child_index = (long)cookie; + int call_count = -1; + afr_private_t *priv = NULL; - local = frame->local; + priv = this->private; + local = frame->local; - LOCK (&frame->lock); - { - fop_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; + LOCK(&frame->lock); + { + __afr_inode_write_fill(frame, this, child_index, op_ret, op_errno, + prebuf, postbuf, xattr, xdata); + call_count = --local->call_count; + } + UNLOCK(&frame->lock); + + if (call_count == 0) { + __afr_inode_write_finalize(frame, this); + + if (afr_txn_nothing_failed(frame, this)) { + /*if it did pre-op, it will do post-op changing ctime*/ + if (priv->consistent_metadata && afr_needs_changelog_update(local)) + afr_zero_fill_stat(local); + local->transaction.unwind(frame, this); } - UNLOCK (&frame->lock); - return fop_frame; + afr_transaction_resume(frame, this); + } + + return 0; } -int -afr_transaction_writev_unwind (call_frame_t *frame, xlator_t *this) +/* {{{ writev */ + +void +afr_writev_copy_outvars(call_frame_t *src_frame, call_frame_t *dst_frame) { - call_frame_t *fop_frame = NULL; + afr_local_t *src_local = NULL; + afr_local_t *dst_local = NULL; - fop_frame = afr_transaction_detach_fop_frame (frame); + src_local = src_frame->local; + dst_local = dst_frame->local; - if (fop_frame) { - afr_writev_copy_outvars (frame, fop_frame); - afr_writev_unwind (fop_frame, this); - } - return 0; + dst_local->op_ret = src_local->op_ret; + dst_local->op_errno = src_local->op_errno; + dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf; + dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf; + if (src_local->xdata_rsp) + dst_local->xdata_rsp = dict_ref(src_local->xdata_rsp); } -static void -afr_writev_handle_short_writes (call_frame_t *frame, xlator_t *this) +void +afr_writev_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = this->private; - local = frame->local; - priv = this->private; - /* - * We already have the best case result of the writev calls staged - * as the return value. Any writev that returns some value less - * than the best case is now out of sync, so mark the fop as - * failed. Note that fops that have returned with errors have - * already been marked as failed. - */ - for (i = 0; i < priv->child_count; i++) { - if ((!local->replies[i].valid) || - (local->replies[i].op_ret == -1)) - continue; + local = frame->local; - if (local->replies[i].op_ret < local->op_ret) - afr_transaction_fop_failed(frame, this, i); - } -} + if (priv->consistent_metadata) + afr_zero_fill_stat(local); -int -afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) -{ - afr_local_t * local = NULL; - afr_private_t *priv = NULL; - call_frame_t *fop_frame = NULL; - int child_index = (long) cookie; - int call_count = -1; - int read_child = 0; - int ret = 0; - uint32_t open_fd_count = 0; - uint32_t write_is_append = 0; - - local = frame->local; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, prebuf, postbuf, - xdata); - - local->replies[child_index].valid = 1; - local->replies[child_index].op_ret = op_ret; - local->replies[child_index].op_errno = op_errno; - - - /* stage the best case return value for unwind */ - if ((local->success_count == 0) || (op_ret > local->op_ret)) { - local->op_ret = op_ret; - local->op_errno = op_errno; - } - - if (op_ret != -1) { - if (xdata) { - ret = dict_get_uint32 (xdata, - GLUSTERFS_OPEN_FD_COUNT, - &open_fd_count); - if ((ret == 0) && - (open_fd_count > local->open_fd_count)) { - local->open_fd_count = open_fd_count; - local->update_open_fd_count = _gf_true; - } - - write_is_append = 0; - ret = dict_get_uint32 (xdata, - GLUSTERFS_WRITE_IS_APPEND, - &write_is_append); - if (ret || !write_is_append) - local->append_write = _gf_false; - } - - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - - if (local->update_open_fd_count) - afr_handle_open_fd_count (frame, this); - - if (!local->stable_write && !local->append_write) - /* An appended write removes the necessity to - fsync() the file. This is because self-heal - has the logic to check for larger file when - the xattrs are not reliably pointing at - a stale file. - */ - afr_fd_report_unstable_write (this, local->fd); - - afr_writev_handle_short_writes (frame, this); - if (afr_any_fops_failed (local, priv)) { - //Don't unwind until post-op is complete - local->transaction.resume (frame, this); - } else { - /* - * Generally inode-write fops do transaction.unwind then - * transaction.resume, but writev needs to make sure that - * delayed post-op frame is placed in fdctx before unwind - * happens. This prevents the race of flush doing the - * changelog wakeup first in fuse thread and then this - * writev placing its delayed post-op frame in fdctx. - * This helps flush make sure all the delayed post-ops are - * completed. - */ - - fop_frame = afr_transaction_detach_fop_frame (frame); - afr_writev_copy_outvars (frame, fop_frame); - local->transaction.resume (frame, this); - afr_writev_unwind (fop_frame, this); - } - } - return 0; + AFR_STACK_UNWIND(writev, frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); } int -afr_writev_wind (call_frame_t *frame, xlator_t *this) +afr_transaction_writev_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int i = 0; - int call_count = -1; - dict_t *xdata = NULL; - GF_UNUSED int ret = 0; + call_frame_t *fop_frame = NULL; - local = frame->local; - priv = this->private; + fop_frame = afr_transaction_detach_fop_frame(frame); - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); + if (fop_frame) { + afr_writev_copy_outvars(frame, fop_frame); + afr_writev_unwind(fop_frame, this); + } + return 0; +} - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } +static void +afr_writev_handle_short_writes(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + /* + * We already have the best case result of the writev calls staged + * as the return value. Any writev that returns some value less + * than the best case is now out of sync, so mark the fop as + * failed. Note that fops that have returned with errors have + * already been marked as failed. + */ + for (i = 0; i < priv->child_count; i++) { + if ((!local->replies[i].valid) || (local->replies[i].op_ret == -1)) + continue; + + if (local->replies[i].op_ret < local->op_ret) + afr_transaction_fop_failed(frame, this, i); + } +} - local->call_count = call_count; - local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies), - gf_afr_mt_reply_t); - if (!local->replies) { - local->op_ret = -1; - local->op_errno = ENOMEM; - local->transaction.unwind(frame, this); - local->transaction.resume(frame, this); - return 0; - } - - xdata = dict_new (); - if (xdata) { - ret = dict_set_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT, - sizeof (uint32_t)); - ret = dict_set_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND, - 0); - /* Set append_write to be true speculatively. If on any - server it turns not be true, we unset it in the - callback. - */ - local->append_write = _gf_true; +void +afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int ret = 0; + afr_local_t *local = frame->local; + uint32_t open_fd_count = 0; + uint32_t write_is_append = 0; + int32_t num_inodelks = 0; + + LOCK(&frame->lock); + { + __afr_inode_write_fill(frame, this, child_index, op_ret, op_errno, + prebuf, postbuf, NULL, xdata); + if (op_ret == -1 || !xdata) + goto unlock; + + write_is_append = 0; + ret = dict_get_uint32(xdata, GLUSTERFS_WRITE_IS_APPEND, + &write_is_append); + if (ret || !write_is_append) + local->append_write = _gf_false; + + ret = dict_get_uint32(xdata, GLUSTERFS_ACTIVE_FD_COUNT, &open_fd_count); + if (ret < 0) + goto unlock; + if (open_fd_count > local->open_fd_count) { + local->open_fd_count = open_fd_count; + local->update_open_fd_count = _gf_true; } - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->writev, - local->fd, - local->cont.writev.vector, - local->cont.writev.count, - local->cont.writev.offset, - local->cont.writev.flags, - local->cont.writev.iobref, - xdata); - - if (!--call_count) - break; - } + ret = dict_get_int32_sizen(xdata, GLUSTERFS_INODELK_COUNT, + &num_inodelks); + if (ret < 0) + goto unlock; + if (num_inodelks > local->num_inodelks) { + local->num_inodelks = num_inodelks; + local->update_num_inodelks = _gf_true; } - - if (xdata) - dict_unref (xdata); - - return 0; + } +unlock: + UNLOCK(&frame->lock); } - -int -afr_writev_done (call_frame_t *frame, xlator_t *this) +void +afr_process_post_writev(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; + afr_lock_t *lock = NULL; - local = frame->local; + local = frame->local; - iobref_unref (local->cont.writev.iobref); - local->cont.writev.iobref = NULL; + if (!local->stable_write && !local->append_write) + /* An appended write removes the necessity to + fsync() the file. This is because self-heal + has the logic to check for larger file when + the xattrs are not reliably pointing at + a stale file. + */ + afr_fd_report_unstable_write(this, local); - local->transaction.unwind (frame, this); + __afr_inode_write_finalize(frame, this); - AFR_STACK_DESTROY (frame); + afr_writev_handle_short_writes(frame, this); - return 0; + if (local->update_open_fd_count) + local->inode_ctx->open_fd_count = local->open_fd_count; + if (local->update_num_inodelks && + local->transaction.type == AFR_DATA_TRANSACTION) { + lock = &local->inode_ctx->lock[local->transaction.type]; + lock->num_inodelks = local->num_inodelks; + } } - int -afr_do_writev (call_frame_t *frame, xlator_t *this) +afr_writev_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - call_frame_t *transaction_frame = NULL; - afr_local_t *local = NULL; - int op_ret = -1; - int op_errno = 0; - - local = frame->local; - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; - goto out; - } - - transaction_frame->local = local; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + call_frame_t *fop_frame = NULL; + int child_index = (long)cookie; + int call_count = -1; - local->op = GF_FOP_WRITE; + afr_inode_write_fill(frame, this, child_index, op_ret, op_errno, prebuf, + postbuf, xdata); - local->success_count = 0; + call_count = afr_frame_return(frame); - local->transaction.fop = afr_writev_wind; - local->transaction.done = afr_writev_done; - local->transaction.unwind = afr_transaction_writev_unwind; + if (call_count == 0) { + afr_process_post_writev(frame, this); - local->transaction.main_frame = frame; - if (local->fd->flags & O_APPEND) { - /* - * Backend vfs ignores the 'offset' for append mode fd so - * locking just the region provided for the writev does not - * give consistency gurantee. The actual write may happen at a - * completely different range than the one provided by the - * offset, len in the fop. So lock the entire file. - */ - local->transaction.start = 0; - local->transaction.len = 0; + if (!afr_txn_nothing_failed(frame, this)) { + // Don't unwind until post-op is complete + afr_transaction_resume(frame, this); } else { - local->transaction.start = local->cont.writev.offset; - local->transaction.len = iov_length (local->cont.writev.vector, - local->cont.writev.count); - } - - op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } - - op_ret = 0; -out: - if (op_ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL, NULL); - } - - return 0; + /* + * Generally inode-write fops do transaction.unwind then + * transaction.resume, but writev needs to make sure that + * delayed post-op frame is placed in fdctx before unwind + * happens. This prevents the race of flush doing the + * changelog wakeup first in fuse thread and then this + * writev placing its delayed post-op frame in fdctx. + * This helps flush make sure all the delayed post-ops are + * completed. + */ + + fop_frame = afr_transaction_detach_fop_frame(frame); + afr_writev_copy_outvars(frame, fop_frame); + afr_transaction_resume(frame, this); + afr_writev_unwind(fop_frame, this); + } + } + return 0; } -static void -afr_trigger_open_fd_self_heal (fd_t *fd, xlator_t *this) -{ - call_frame_t *frame = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - char *reason = NULL; - int32_t op_errno = 0; - int ret = 0; - - if (!fd || !fd->inode || uuid_is_null (fd->inode->gfid)) { - gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid args: " - "fd: %p, inode: %p", fd, - fd ? fd->inode : NULL); - goto out; - } - - frame = create_frame (this, this->ctx->pool); - if (!frame) - goto out; - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - ret = afr_local_init (local, this->private, &op_errno); - if (ret < 0) - goto out; +static int +afr_arbiter_writev_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + static char byte = 0xFF; + static struct iovec vector = {&byte, 1}; + int32_t count = 1; - local->loc.inode = inode_ref (fd->inode); - ret = loc_path (&local->loc, NULL); - if (ret < 0) - goto out; + STACK_WIND_COOKIE( + frame, afr_writev_wind_cbk, (void *)(long)subvol, + priv->children[subvol], priv->children[subvol]->fops->writev, local->fd, + &vector, count, local->cont.writev.offset, local->cont.writev.flags, + local->cont.writev.iobref, local->xdata_req); - sh = &local->self_heal; - sh->do_metadata_self_heal = _gf_true; - if (fd->inode->ia_type == IA_IFREG) - sh->do_data_self_heal = _gf_true; - else if (fd->inode->ia_type == IA_IFDIR) - sh->do_entry_self_heal = _gf_true; - - reason = "subvolume came online"; - afr_launch_self_heal (frame, this, fd->inode, _gf_true, - fd->inode->ia_type, reason, NULL, NULL); - return; -out: - AFR_STACK_DESTROY (frame); + return 0; } -void -afr_open_fd_fix (fd_t *fd, xlator_t *this) +int +afr_writev_wind(call_frame_t *frame, xlator_t *this, int subvol) { - int ret = 0; - int i = 0; - afr_fd_ctx_t *fd_ctx = NULL; - gf_boolean_t need_self_heal = _gf_false; - int *need_open = NULL; - size_t need_open_count = 0; - afr_private_t *priv = NULL; - - priv = this->private; - - if (!afr_is_fd_fixable (fd)) - goto out; - - fd_ctx = afr_fd_ctx_get (fd, this); - if (!fd_ctx) - goto out; - - LOCK (&fd->lock); - { - if (fd_ctx->up_count < priv->up_count) { - need_self_heal = _gf_true; - fd_ctx->up_count = priv->up_count; - fd_ctx->down_count = priv->down_count; - } - - need_open = alloca (priv->child_count * sizeof (*need_open)); - for (i = 0; i < priv->child_count; i++) { - need_open[i] = 0; - if (fd_ctx->opened_on[i] != AFR_FD_NOT_OPENED) - continue; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - if (!priv->child_up[i]) - continue; + local = frame->local; + priv = this->private; - fd_ctx->opened_on[i] = AFR_FD_OPENING; - - need_open[i] = 1; - need_open_count++; - } - } - UNLOCK (&fd->lock); - if (ret) - goto out; - - if (need_self_heal) - afr_trigger_open_fd_self_heal (fd, this); - - if (!need_open_count) - goto out; + if (AFR_IS_ARBITER_BRICK(priv, subvol)) { + afr_arbiter_writev_wind(frame, this, subvol); + return 0; + } - afr_fix_open (this, fd, need_open_count, need_open); -out: - return; + STACK_WIND_COOKIE(frame, afr_writev_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->writev, local->fd, + local->cont.writev.vector, local->cont.writev.count, + local->cont.writev.offset, local->cont.writev.flags, + local->cont.writev.iobref, local->xdata_req); + return 0; } int -afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int32_t count, off_t offset, - uint32_t flags, struct iobref *iobref, dict_t *xdata) +afr_do_writev(call_frame_t *frame, xlator_t *this) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - int ret = -1; - int op_errno = 0; + call_frame_t *transaction_frame = NULL; + afr_local_t *local = NULL; + int ret = -1; + int op_errno = ENOMEM; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - priv = this->private; + local = frame->local; + transaction_frame->local = local; + frame->local = NULL; - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } + if (!AFR_FRAME_INIT(frame, op_errno)) + goto out; - QUORUM_CHECK(writev,out); + local->op = GF_FOP_WRITE; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + local->transaction.wind = afr_writev_wind; + local->transaction.unwind = afr_transaction_writev_unwind; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - local->cont.writev.vector = iov_dup (vector, count); - local->cont.writev.count = count; - local->cont.writev.offset = offset; - local->cont.writev.flags = flags; - local->cont.writev.iobref = iobref_ref (iobref); - - local->fd = fd_ref (fd); + local->transaction.main_frame = frame; - /* detect here, but set it in writev_wind_cbk *after* the unstable - write is performed - */ - local->stable_write = !!((fd->flags|flags)&(O_SYNC|O_DSYNC)); - - afr_open_fd_fix (fd, this); + if (local->fd->flags & O_APPEND) { + /* + * Backend vfs ignores the 'offset' for append mode fd so + * locking just the region provided for the writev does not + * give consistency guarantee. The actual write may happen at a + * completely different range than the one provided by the + * offset, len in the fop. So lock the entire file. + */ + local->transaction.start = 0; + local->transaction.len = 0; + } else { + local->transaction.start = local->cont.writev.offset; + local->transaction.len = iov_length(local->cont.writev.vector, + local->cont.writev.count); + } + + ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - afr_do_writev (frame, this); + AFR_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} - ret = 0; +int +afr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) +{ + afr_local_t *local = NULL; + int op_errno = ENOMEM; + int ret = -1; + + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->cont.writev.vector = iov_dup(vector, count); + if (!local->cont.writev.vector) + goto out; + local->cont.writev.count = count; + local->cont.writev.offset = offset; + local->cont.writev.flags = flags; + local->cont.writev.iobref = iobref_ref(iobref); + + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); + + if (!local->xdata_req) + goto out; + + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; + + if (dict_set_uint32(local->xdata_req, GLUSTERFS_ACTIVE_FD_COUNT, 4)) { + op_errno = ENOMEM; + goto out; + } + + if (dict_set_str_sizen(local->xdata_req, GLUSTERFS_INODELK_DOM_COUNT, + this->name)) { + op_errno = ENOMEM; + goto out; + } + + if (dict_set_uint32(local->xdata_req, GLUSTERFS_WRITE_IS_APPEND, 4)) { + op_errno = ENOMEM; + goto out; + } + + /* Set append_write to be true speculatively. If on any + server it turns not be true, we unset it in the + callback. + */ + local->append_write = _gf_true; + + /* detect here, but set it in writev_wind_cbk *after* the unstable + write is performed + */ + local->stable_write = !!((fd->flags | flags) & (O_SYNC | O_DSYNC)); + + afr_fix_open(fd, this); + + afr_do_writev(frame, this); + + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); + AFR_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + return 0; } - /* }}} */ /* {{{ truncate */ int -afr_truncate_unwind (call_frame_t *frame, xlator_t *this) +afr_truncate_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (truncate, main_frame, local->op_ret, - local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - NULL); - } + local = frame->local; + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} - -int -afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) -{ - afr_local_t * local = NULL; - int child_index = (long) cookie; - int read_child = 0; - int call_count = -1; - - local = frame->local; - - read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - if (op_ret != -1) { - if (prebuf->ia_size != postbuf->ia_size) - local->stable_write = _gf_false; - } - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, prebuf, postbuf, - xdata); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (local->stable_write && afr_txn_nothing_failed (frame, this)) - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } - - return 0; + AFR_STACK_UNWIND(truncate, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; } - -int32_t -afr_truncate_wind (call_frame_t *frame, xlator_t *this) +int +afr_truncate_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } + afr_local_t *local = NULL; - local->call_count = call_count; - local->stable_write = _gf_true; + local = frame->local; - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->truncate, - &local->loc, - local->cont.truncate.offset, - NULL); - - if (!--call_count) - break; - } - } + if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; - return 0; + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf, + postbuf, NULL, xdata); } - int -afr_truncate_done (call_frame_t *frame, xlator_t *this) +afr_truncate_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; + local = frame->local; + priv = this->private; - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; + STACK_WIND_COOKIE(frame, afr_truncate_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->truncate, &local->loc, + local->cont.truncate.offset, local->xdata_req); + return 0; } - int -afr_truncate (call_frame_t *frame, xlator_t *this, - loc_t *loc, off_t offset, dict_t *xdata) +afr_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - priv = this->private; + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - QUORUM_CHECK(truncate,out); + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; - goto out; - } + local->cont.truncate.offset = offset; + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; + if (!local->xdata_req) + goto out; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local->transaction.wind = afr_truncate_wind; + local->transaction.unwind = afr_truncate_unwind; - local->cont.truncate.offset = offset; + loc_copy(&local->loc, loc); + ret = afr_set_inode_local(this, local, loc->inode); + if (ret) + goto out; - local->transaction.fop = afr_truncate_wind; - local->transaction.done = afr_truncate_done; - local->transaction.unwind = afr_truncate_unwind; + local->op = GF_FOP_TRUNCATE; - loc_copy (&local->loc, loc); + local->transaction.main_frame = frame; + local->transaction.start = offset; + local->transaction.len = 0; - local->transaction.main_frame = frame; - local->transaction.start = offset; - local->transaction.len = 0; + /* Set it true speculatively, will get reset in afr_truncate_wind_cbk + if truncate was not a NOP */ + local->stable_write = _gf_true; - ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } + ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(truncate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } - /* }}} */ /* {{{ ftruncate */ - int -afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this) +afr_ftruncate_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; + local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret, - local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - NULL); - } + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} - - -int -afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) -{ - afr_local_t * local = NULL; - int child_index = (long) cookie; - int call_count = -1; - int read_child = 0; - - local = frame->local; - - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - if (op_ret != -1) { - if (prebuf->ia_size != postbuf->ia_size) - local->stable_write = _gf_false; - } - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, prebuf, postbuf, - xdata); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (local->stable_write && afr_txn_nothing_failed (frame, this)) - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } - - return 0; + AFR_STACK_UNWIND(ftruncate, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; } - int -afr_ftruncate_wind (call_frame_t *frame, xlator_t *this) +afr_ftruncate_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; + afr_local_t *local = NULL; - local = frame->local; - priv = this->private; + local = frame->local; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); + if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - local->stable_write = _gf_true; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->ftruncate, - local->fd, - local->cont.ftruncate.offset, - NULL); - - if (!--call_count) - break; - } - } - - return 0; + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf, + postbuf, NULL, xdata); } - int -afr_ftruncate_done (call_frame_t *frame, xlator_t *this) +afr_ftruncate_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - - local = frame->local; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local->transaction.unwind (frame, this); + local = frame->local; + priv = this->private; - AFR_STACK_DESTROY (frame); - - return 0; + STACK_WIND_COOKIE(frame, afr_ftruncate_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->ftruncate, local->fd, + local->cont.ftruncate.offset, local->xdata_req); + return 0; } - int -afr_do_ftruncate (call_frame_t *frame, xlator_t *this) +afr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - call_frame_t * transaction_frame = NULL; - afr_local_t * local = NULL; - int op_ret = -1; - int op_errno = 0; - - local = frame->local; - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - goto out; - } - - transaction_frame->local = local; - frame->local = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - local->op = GF_FOP_FTRUNCATE; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - local->transaction.fop = afr_ftruncate_wind; - local->transaction.done = afr_ftruncate_done; - local->transaction.unwind = afr_ftruncate_unwind; + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - local->transaction.main_frame = frame; + local->cont.ftruncate.offset = offset; + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - local->transaction.start = local->cont.ftruncate.offset; - local->transaction.len = 0; + if (!local->xdata_req) + goto out; - op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } - - op_ret = 0; -out: - if (op_ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, - NULL, NULL); - } + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; - return 0; -} - - -int -afr_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } - QUORUM_CHECK(ftruncate,out); + local->op = GF_FOP_FTRUNCATE; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + local->transaction.wind = afr_ftruncate_wind; + local->transaction.unwind = afr_ftruncate_unwind; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local->transaction.main_frame = frame; - local->cont.ftruncate.offset = offset; + local->transaction.start = local->cont.ftruncate.offset; + local->transaction.len = 0; - local->fd = fd_ref (fd); + afr_fix_open(fd, this); - afr_open_fd_fix (fd, this); + /* Set it true speculatively, will get reset in afr_ftruncate_wind_cbk + if truncate was not a NOP */ + local->stable_write = _gf_true; - afr_do_ftruncate (frame, this); + ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); - } + AFR_STACK_UNWIND(ftruncate, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + return 0; } /* }}} */ @@ -1007,1605 +788,1778 @@ out: /* {{{ setattr */ int -afr_setattr_unwind (call_frame_t *frame, xlator_t *this) +afr_setattr_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (setattr, main_frame, local->op_ret, - local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - NULL); - } + local = frame->local; + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} - - -int -afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop, dict_t *xdata) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int child_index = (long) cookie; - int read_child = 0; - int call_count = -1; - int need_unwind = 0; - - local = frame->local; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, preop, postop, - xdata); - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + AFR_STACK_UNWIND(setattr, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; } - -int32_t -afr_setattr_wind (call_frame_t *frame, xlator_t *this) +int +afr_setattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *preop, + struct iatt *postop, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setattr, - &local->loc, - &local->cont.setattr.in_buf, - local->cont.setattr.valid, - NULL); - - if (!--call_count) - break; - } - } - - return 0; + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, preop, + postop, NULL, xdata); } - int -afr_setattr_done (call_frame_t *frame, xlator_t *this) +afr_setattr_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - - local = frame->local; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local->transaction.unwind (frame, this); + local = frame->local; + priv = this->private; - AFR_STACK_DESTROY (frame); - - return 0; + STACK_WIND_COOKIE(frame, afr_setattr_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->setattr, &local->loc, + &local->cont.setattr.in_buf, local->cont.setattr.valid, + local->xdata_req); + return 0; } - int -afr_setattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct iatt *buf, int32_t valid, dict_t *xdata) +afr_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf, + int32_t valid, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - priv = this->private; + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - QUORUM_CHECK(setattr,out); + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; - goto out; - } + local->cont.setattr.in_buf = *buf; + local->cont.setattr.valid = valid; + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; + if (!local->xdata_req) + goto out; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local->transaction.wind = afr_setattr_wind; + local->transaction.unwind = afr_setattr_unwind; - local->cont.setattr.in_buf = *buf; - local->cont.setattr.valid = valid; + loc_copy(&local->loc, loc); + ret = afr_set_inode_local(this, local, loc->inode); + if (ret) + goto out; - local->transaction.fop = afr_setattr_wind; - local->transaction.done = afr_setattr_done; - local->transaction.unwind = afr_setattr_unwind; + local->op = GF_FOP_SETATTR; - loc_copy (&local->loc, loc); + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; + ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(setattr, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } /* {{{ fsetattr */ int -afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this) +afr_fsetattr_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret, - local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - NULL); - } + local = frame->local; + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(fsetattr, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; +} int -afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop, dict_t *xdata) +afr_fsetattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preop, + struct iatt *postop, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int child_index = (long) cookie; - int read_child = 0; - int call_count = -1; - int need_unwind = 0; - - local = frame->local; - priv = this->private; + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, preop, + postop, NULL, xdata); +} - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); +int +afr_fsetattr_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } + local = frame->local; + priv = this->private; - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, preop, postop, - xdata); + STACK_WIND_COOKIE(frame, afr_fsetattr_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->fsetattr, local->fd, + &local->cont.fsetattr.in_buf, local->cont.fsetattr.valid, + local->xdata_req); + return 0; +} - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); +int +afr_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf, + int32_t valid, dict_t *xdata) +{ + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - if (need_unwind) - local->transaction.unwind (frame, this); + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - call_count = afr_frame_return (frame); + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - if (call_count == 0) { - local->transaction.resume (frame, this); - } + local->cont.fsetattr.in_buf = *buf; + local->cont.fsetattr.valid = valid; + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - return 0; -} + if (!local->xdata_req) + goto out; + local->transaction.wind = afr_fsetattr_wind; + local->transaction.unwind = afr_fsetattr_unwind; -int32_t -afr_fsetattr_wind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; - local = frame->local; - priv = this->private; + local->op = GF_FOP_FSETATTR; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); + afr_fix_open(fd, this); - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - local->call_count = call_count; + ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fsetattr, - local->fd, - &local->cont.fsetattr.in_buf, - local->cont.fsetattr.valid, - NULL); - - if (!--call_count) - break; - } - } + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(fsetattr, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } +/* {{{ setxattr */ int -afr_fsetattr_done (call_frame_t *frame, xlator_t *this) +afr_setxattr_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; + local = frame->local; - local->transaction.unwind (frame, this); + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) + return 0; - AFR_STACK_DESTROY (frame); + AFR_STACK_UNWIND(setxattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); + return 0; +} - return 0; +int +afr_setxattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL, + NULL, NULL, xdata); } int -afr_fsetattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata) +afr_setxattr_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + local = frame->local; + priv = this->private; - priv = this->private; + STACK_WIND_COOKIE(frame, afr_setxattr_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->setxattr, &local->loc, + local->cont.setxattr.dict, local->cont.setxattr.flags, + local->xdata_req); + return 0; +} - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } +int +afr_emptyb_set_pending_changelog_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + dict_t *xattr, dict_t *xdata) - QUORUM_CHECK(fsetattr,out); +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i, ret = 0; + char *op_type = NULL; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; - goto out; - } + local = frame->local; + priv = this->private; + i = (long)cookie; - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + ret = dict_get_str_sizen(local->xdata_req, "replicate-brick-op", &op_type); + if (ret) + goto out; - local->cont.fsetattr.in_buf = *buf; - local->cont.fsetattr.valid = valid; + gf_smsg(this->name, op_ret ? GF_LOG_ERROR : GF_LOG_INFO, + op_ret ? op_errno : 0, AFR_MSG_SET_PEND_XATTR, "name=%s", + priv->children[i]->name, "op_ret=%s", + op_ret ? "failed" : "succeeded", NULL); + +out: + syncbarrier_wake(&local->barrier); + return 0; +} - local->transaction.fop = afr_fsetattr_wind; - local->transaction.done = afr_fsetattr_done; - local->transaction.unwind = afr_fsetattr_unwind; +int +afr_emptyb_set_pending_changelog(call_frame_t *frame, xlator_t *this, + unsigned char *locked_nodes) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int ret = 0, i = 0; - local->fd = fd_ref (fd); + local = frame->local; + priv = this->private; - afr_open_fd_fix (fd, this); + AFR_ONLIST(locked_nodes, frame, afr_emptyb_set_pending_changelog_cbk, + xattrop, &local->loc, GF_XATTROP_ADD_ARRAY, local->xattr_req, + NULL); - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; + /* It is sufficient if xattrop was successful on one child */ + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; - ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); - if (ret < 0) { - op_errno = -ret; + if (local->replies[i].op_ret == 0) { + ret = 0; goto out; + } else { + ret = afr_higher_errno(ret, local->replies[i].op_errno); } - - ret = 0; + } out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL); - } - - return 0; + return -ret; } +static int +_afr_handle_empty_brick_type(xlator_t *this, call_frame_t *frame, loc_t *loc, + int empty_index, afr_transaction_type type, + char *op_type, const int op_type_len) +{ + int count = 0; + int ret = -ENOMEM; + int idx = -1; + int d_idx = -1; + unsigned char *locked_nodes = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + local = frame->local; + + locked_nodes = alloca0(priv->child_count); + + idx = afr_index_for_transaction_type(type); + d_idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION); + + local->pending = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS); + if (!local->pending) + goto out; + + local->pending[empty_index][idx] = hton32(1); + + if ((priv->esh_granular) && (type == AFR_ENTRY_TRANSACTION)) + local->pending[empty_index][d_idx] = hton32(1); + + local->xdata_req = dict_new(); + if (!local->xdata_req) + goto out; + + ret = dict_set_nstrn(local->xdata_req, "replicate-brick-op", + SLEN("replicate-brick-op"), op_type, op_type_len); + if (ret) + goto out; + + local->xattr_req = dict_new(); + if (!local->xattr_req) + goto out; + + ret = afr_set_pending_dict(priv, local->xattr_req, local->pending); + if (ret < 0) + goto out; + + if (AFR_ENTRY_TRANSACTION == type) { + count = afr_selfheal_entrylk(frame, this, loc->inode, this->name, NULL, + locked_nodes); + } else { + count = afr_selfheal_inodelk(frame, this, loc->inode, this->name, + LLONG_MAX - 1, 0, locked_nodes); + } + + if (!count) { + gf_smsg(this->name, GF_LOG_ERROR, EAGAIN, AFR_MSG_REPLACE_BRICK_STATUS, + NULL); + ret = -EAGAIN; + goto unlock; + } + + ret = afr_emptyb_set_pending_changelog(frame, this, locked_nodes); + if (ret) + goto unlock; + ret = 0; +unlock: + if (AFR_ENTRY_TRANSACTION == type) { + afr_selfheal_unentrylk(frame, this, loc->inode, this->name, NULL, + locked_nodes, NULL); + } else { + afr_selfheal_uninodelk(frame, this, loc->inode, this->name, + LLONG_MAX - 1, 0, locked_nodes); + } +out: + return ret; +} -/* {{{ setxattr */ +void +afr_brick_args_cleanup(void *opaque) +{ + afr_empty_brick_args_t *data = NULL; + data = opaque; + loc_wipe(&data->loc); + GF_FREE(data); +} int -afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) +_afr_handle_empty_brick_cbk(int ret, call_frame_t *frame, void *opaque) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + afr_brick_args_cleanup(opaque); + return 0; +} - if (main_frame) { - AFR_STACK_UNWIND (setxattr, main_frame, - local->op_ret, local->op_errno, - NULL); - } - return 0; +int +_afr_handle_empty_brick(void *opaque) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int empty_index = -1; + int ret = -1; + int op_errno = ENOMEM; + call_frame_t *frame = NULL; + xlator_t *this = NULL; + char *op_type = NULL; + int op_type_len = 0; + afr_empty_brick_args_t *data = NULL; + call_frame_t *op_frame = NULL; + + data = opaque; + frame = data->frame; + empty_index = data->empty_index; + if (!data->op_type) + goto out; + + op_frame = copy_frame(frame); + if (!op_frame) { + ret = -1; + op_errno = ENOMEM; + goto out; + } + + op_type = data->op_type; + op_type_len = strlen(op_type); + this = op_frame->this; + priv = this->private; + + afr_set_lk_owner(op_frame, this, op_frame->root); + local = AFR_FRAME_INIT(op_frame, op_errno); + if (!local) + goto out; + + loc_copy(&local->loc, &data->loc); + + gf_smsg(this->name, GF_LOG_INFO, 0, AFR_MSG_NEW_BRICK, "name=%s", + priv->children[empty_index]->name, NULL); + + ret = _afr_handle_empty_brick_type(this, op_frame, &local->loc, empty_index, + AFR_METADATA_TRANSACTION, op_type, + op_type_len); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + + dict_unref(local->xdata_req); + dict_unref(local->xattr_req); + afr_matrix_cleanup(local->pending, priv->child_count); + local->pending = NULL; + local->xattr_req = NULL; + local->xdata_req = NULL; + + ret = _afr_handle_empty_brick_type(this, op_frame, &local->loc, empty_index, + AFR_ENTRY_TRANSACTION, op_type, + op_type_len); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + ret = 0; +out: + if (op_frame) { + AFR_STACK_DESTROY(op_frame); + } + AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL); + return 0; } +int +afr_split_brain_resolve_do(call_frame_t *frame, xlator_t *this, loc_t *loc, + char *data) +{ + afr_local_t *local = NULL; + int ret = -1; + int op_errno = EINVAL; + + local = frame->local; + local->xdata_req = dict_new(); + + if (!local->xdata_req) { + op_errno = ENOMEM; + goto out; + } + + ret = dict_set_int32_sizen(local->xdata_req, "heal-op", + GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + ret = dict_set_str_sizen(local->xdata_req, "child-name", data); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + /* set spb choice to -1 whether heal succeeds or not: + * If heal succeeds : spb-choice should be set to -1 as + * it is no longer valid; file is not + * in split-brain anymore. + * If heal doesn't succeed: + * spb-choice should be set to -1 + * otherwise reads will be served + * from spb-choice which is misleading. + */ + ret = afr_inode_split_brain_choice_set(loc->inode, this, -1); + if (ret) + gf_smsg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN_SET_FAILED, + NULL); + afr_heal_splitbrain_file(frame, this, loc); + ret = 0; +out: + if (ret < 0) + AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL); + return 0; +} int -afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +afr_get_split_brain_child_index(xlator_t *this, void *value, size_t len) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int need_unwind = 0; - int child_index = (long) cookie; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - __inode_write_fop_cbk (frame, child_index, -1, this, - &op_ret, &op_errno, NULL, NULL, - xdata); - if (local->success_count == priv->child_count) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); + int spb_child_index = -1; + char *spb_child_str = NULL; - if (need_unwind) - local->transaction.unwind (frame, this); + spb_child_str = alloca0(len + 1); + memcpy(spb_child_str, value, len); - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } + if (!strcmp(spb_child_str, "none")) + return -2; - return 0; + spb_child_index = afr_get_child_index_from_name(this, spb_child_str); + if (spb_child_index < 0) { + gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL, + "subvol=%s", spb_child_str, NULL); + } + return spb_child_index; } - int -afr_setxattr_wind (call_frame_t *frame, xlator_t *this) +afr_can_set_split_brain_choice(void *opaque) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; + afr_spbc_timeout_t *data = opaque; + call_frame_t *frame = NULL; + xlator_t *this = NULL; + loc_t *loc = NULL; + int ret = -1; - local = frame->local; - priv = this->private; + frame = data->frame; + loc = data->loc; + this = frame->this; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); + ret = afr_is_split_brain(frame, this, loc->inode, loc->gfid, &data->d_spb, + &data->m_spb); - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, 0, + AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED, "gfid=%s", + uuid_utoa(loc->gfid), NULL); + return ret; +} - local->call_count = call_count; +int +afr_handle_split_brain_commands(xlator_t *this, call_frame_t *frame, loc_t *loc, + dict_t *dict) +{ + void *choice_value = NULL; + void *resolve_value = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_spbc_timeout_t *data = NULL; + int len = 0; + int spb_child_index = -1; + int ret = -1; + int op_errno = EINVAL; + + priv = this->private; + + ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_CHOICE, &choice_value, &len); + ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_RESOLVE, &resolve_value, + &len); + if (!choice_value && !resolve_value) { + ret = -1; + goto out; + } + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) { + ret = 1; + goto out; + } + + local->op = GF_FOP_SETXATTR; + + if (choice_value) { + spb_child_index = afr_get_split_brain_child_index(this, choice_value, + len); + if (spb_child_index < 0) { + /* Case where value was "none" */ + if (spb_child_index == -2) + spb_child_index = -1; + else { + ret = 1; + op_errno = EINVAL; + goto out; + } + } - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setxattr, - &local->loc, - local->cont.setxattr.dict, - local->cont.setxattr.flags, - NULL); - - if (!--call_count) - break; - } + data = GF_CALLOC(1, sizeof(*data), gf_afr_mt_spbc_timeout_t); + if (!data) { + ret = 1; + goto out; + } + data->spb_child_index = spb_child_index; + data->frame = frame; + loc_copy(&local->loc, loc); + data->loc = &local->loc; + ret = synctask_new(this->ctx->env, afr_can_set_split_brain_choice, + afr_set_split_brain_choice, NULL, data); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS, + "name=%s", loc->name, NULL); + ret = 1; + op_errno = ENOMEM; + goto out; + } + ret = 0; + goto out; + } + + if (resolve_value) { + spb_child_index = afr_get_split_brain_child_index(this, resolve_value, + len); + if (spb_child_index < 0) { + ret = 1; + goto out; } - return 0; + afr_split_brain_resolve_do(frame, this, loc, + priv->children[spb_child_index]->name); + ret = 0; + } +out: + /* key was correct but value was invalid when ret == 1 */ + if (ret == 1) { + AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL); + if (data) + GF_FREE(data); + ret = 0; + } + return ret; } - int -afr_setxattr_done (call_frame_t *frame, xlator_t *this) +afr_handle_spb_choice_timeout(xlator_t *this, call_frame_t *frame, dict_t *dict) { - afr_local_t *local = frame->local; + int ret = -1; + int op_errno = 0; + uint64_t timeout = 0; + afr_private_t *priv = NULL; - local->transaction.unwind (frame, this); + priv = this->private; - AFR_STACK_DESTROY (frame); + ret = dict_get_uint64(dict, GF_AFR_SPB_CHOICE_TIMEOUT, &timeout); + if (!ret) { + priv->spb_choice_timeout = timeout * 60; + AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL); + } - return 0; + return ret; } int -afr_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = EINVAL; - - VALIDATE_OR_GOTO (this, out); - - GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict, - op_errno, out); - - GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict, - op_errno, out); - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - QUORUM_CHECK(setxattr,out); - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; - goto out; +afr_handle_empty_brick(xlator_t *this, call_frame_t *frame, loc_t *loc, + dict_t *dict) +{ + int ret = -1; + int ab_ret = -1; + int empty_index = -1; + int op_errno = EPERM; + char *empty_brick = NULL; + char *op_type = NULL; + afr_empty_brick_args_t *data = NULL; + + ret = dict_get_str_sizen(dict, GF_AFR_REPLACE_BRICK, &empty_brick); + if (!ret) + op_type = GF_AFR_REPLACE_BRICK; + + ab_ret = dict_get_str_sizen(dict, GF_AFR_ADD_BRICK, &empty_brick); + if (!ab_ret) + op_type = GF_AFR_ADD_BRICK; + + if (ret && ab_ret) + goto out; + + if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT) { + gf_smsg(this->name, GF_LOG_ERROR, EPERM, AFR_MSG_INTERNAL_ATTR, + "op_type=%s", op_type, NULL); + ret = 1; + goto out; + } + empty_index = afr_get_child_index_from_name(this, empty_brick); + + if (empty_index < 0) { + /* Didn't belong to this replica pair + * Just do a no-op + */ + AFR_STACK_UNWIND(setxattr, frame, 0, 0, NULL); + return 0; + } else { + data = GF_CALLOC(1, sizeof(*data), gf_afr_mt_empty_brick_t); + if (!data) { + ret = 1; + op_errno = ENOMEM; + goto out; } - - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - local->cont.setxattr.dict = dict_ref (dict); - local->cont.setxattr.flags = flags; - - local->transaction.fop = afr_setxattr_wind; - local->transaction.done = afr_setxattr_done; - local->transaction.unwind = afr_setxattr_unwind; - - loc_copy (&local->loc, loc); - - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; - - ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); - if (ret < 0) { - op_errno = -ret; + data->frame = frame; + loc_copy(&data->loc, loc); + data->empty_index = empty_index; + data->op_type = op_type; + ret = synctask_new(this->ctx->env, _afr_handle_empty_brick, + _afr_handle_empty_brick_cbk, NULL, data); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS, + NULL); + ret = 1; + op_errno = ENOMEM; + afr_brick_args_cleanup(data); goto out; } - - ret = 0; + } + ret = 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); - } - - return 0; + if (ret == 1) { + AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL); + ret = 0; + } + return ret; } -/* {{{ fsetxattr */ - - -int -afr_fsetxattr_unwind (call_frame_t *frame, xlator_t *this) +static int +afr_handle_special_xattr(xlator_t *this, call_frame_t *frame, loc_t *loc, + dict_t *dict) { - afr_local_t *local = NULL; - call_frame_t *main_frame = NULL; + int ret = -1; - local = frame->local; + ret = afr_handle_split_brain_commands(this, frame, loc, dict); + if (ret == 0) + goto out; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + ret = afr_handle_spb_choice_timeout(this, frame, dict); + if (ret == 0) + goto out; - if (main_frame) { - AFR_STACK_UNWIND (fsetxattr, main_frame, - local->op_ret, local->op_errno, - NULL); - } - return 0; + /* Applicable for replace-brick and add-brick commands */ + ret = afr_handle_empty_brick(this, frame, loc, dict); +out: + return ret; } - int -afr_fsetxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +afr_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int need_unwind = 0; - int child_index = (long) cookie; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = EINVAL; - local = frame->local; - priv = this->private; + GF_IF_INTERNAL_XATTR_GOTO("trusted.afr.*", dict, op_errno, out); - LOCK (&frame->lock); - { + GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.afr.*", dict, op_errno, out); - __inode_write_fop_cbk (frame, child_index, -1, this, - &op_ret, &op_errno, NULL, NULL, - xdata); - if (local->success_count == priv->child_count) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); + ret = afr_handle_special_xattr(this, frame, loc, dict); + if (ret == 0) + return 0; - if (need_unwind) - local->transaction.unwind (frame, this); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - call_count = afr_frame_return (frame); + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - if (call_count == 0) { - local->transaction.resume (frame, this); - } + local->cont.setxattr.dict = dict_ref(dict); + local->cont.setxattr.flags = flags; + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - return 0; -} + if (!local->xdata_req) + goto out; + local->transaction.wind = afr_setxattr_wind; + local->transaction.unwind = afr_setxattr_unwind; -int -afr_fsetxattr_wind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; + loc_copy(&local->loc, loc); + ret = afr_set_inode_local(this, local, loc->inode); + if (ret) + goto out; - local = frame->local; - priv = this->private; + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); + local->op = GF_FOP_SETXATTR; - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } + ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - local->call_count = call_count; + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_fsetxattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fsetxattr, - local->fd, - local->cont.fsetxattr.dict, - local->cont.fsetxattr.flags, - NULL); - - if (!--call_count) - break; - } - } + AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL); - return 0; + return 0; } +/* {{{ fsetxattr */ int -afr_fsetxattr_done (call_frame_t *frame, xlator_t *this) +afr_fsetxattr_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = frame->local; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); + local = frame->local; + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; + + AFR_STACK_UNWIND(fsetxattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); + return 0; } int -afr_fsetxattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata) +afr_fsetxattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = EINVAL; + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL, + NULL, NULL, xdata); +} - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); +int +afr_fsetxattr_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict, - op_errno, out); + local = frame->local; + priv = this->private; - GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict, - op_errno, out); + STACK_WIND_COOKIE(frame, afr_fsetxattr_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->fsetxattr, local->fd, + local->cont.fsetxattr.dict, local->cont.fsetxattr.flags, + local->xdata_req); + return 0; +} - priv = this->private; +int +afr_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata) +{ + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } + GF_IF_INTERNAL_XATTR_GOTO("trusted.afr.*", dict, op_errno, out); - QUORUM_CHECK(fsetxattr,out); + GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.afr.*", dict, op_errno, out); - AFR_LOCAL_ALLOC_OR_GOTO (local, out); + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - goto out; - } + local->cont.fsetxattr.dict = dict_ref(dict); + local->cont.fsetxattr.flags = flags; - transaction_frame->local = local; + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - local->op_ret = -1; + if (!local->xdata_req) + goto out; - local->cont.fsetxattr.dict = dict_ref (dict); - local->cont.fsetxattr.flags = flags; + local->transaction.wind = afr_fsetxattr_wind; + local->transaction.unwind = afr_fsetxattr_unwind; - local->transaction.fop = afr_fsetxattr_wind; - local->transaction.done = afr_fsetxattr_done; - local->transaction.unwind = afr_fsetxattr_unwind; + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; - local->fd = fd_ref (fd); + local->op = GF_FOP_FSETXATTR; - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } + ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(fsetxattr, frame, -1, op_errno, NULL); + return 0; } /* }}} */ - /* {{{ removexattr */ - int -afr_removexattr_unwind (call_frame_t *frame, xlator_t *this) +afr_removexattr_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + local = frame->local; - if (main_frame) { - AFR_STACK_UNWIND (removexattr, main_frame, - local->op_ret, local->op_errno, - NULL); - } + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} - - -int -afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int need_unwind = 0; - int child_index = (long) cookie; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - __inode_write_fop_cbk (frame, child_index, -1, this, - &op_ret, &op_errno, NULL, NULL, - xdata); - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - return 0; + AFR_STACK_UNWIND(removexattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); + return 0; } - -int32_t -afr_removexattr_wind (call_frame_t *frame, xlator_t *this) +int +afr_removexattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->removexattr, - &local->loc, - local->cont.removexattr.name, - NULL); - - if (!--call_count) - break; - } - } - - return 0; + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL, + NULL, NULL, xdata); } - int -afr_removexattr_done (call_frame_t *frame, xlator_t *this) +afr_removexattr_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - AFR_STACK_DESTROY (frame); + local = frame->local; + priv = this->private; - return 0; + STACK_WIND_COOKIE(frame, afr_removexattr_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->removexattr, &local->loc, + local->cont.removexattr.name, local->xdata_req); + return 0; } - int -afr_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name, dict_t *xdata) +afr_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - VALIDATE_OR_GOTO (this, out); + GF_IF_NATIVE_XATTR_GOTO("trusted.afr.*", name, op_errno, out); - GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*", - name, op_errno, out); + GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.afr.*", name, op_errno, out); - GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*", - name, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - priv = this->private; + local->cont.removexattr.name = gf_strdup(name); - QUORUM_CHECK(removexattr,out); + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; - goto out; - } + if (!local->xdata_req) + goto out; - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; + local->transaction.wind = afr_removexattr_wind; + local->transaction.unwind = afr_removexattr_unwind; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - local->cont.removexattr.name = gf_strdup (name); - - local->transaction.fop = afr_removexattr_wind; - local->transaction.done = afr_removexattr_done; - local->transaction.unwind = afr_removexattr_unwind; + loc_copy(&local->loc, loc); + ret = afr_set_inode_local(this, local, loc->inode); + if (ret) + goto out; - loc_copy (&local->loc, loc); + local->op = GF_FOP_REMOVEXATTR; - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } + ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(removexattr, frame, -1, op_errno, NULL); + return 0; } /* ffremovexattr */ int -afr_fremovexattr_unwind (call_frame_t *frame, xlator_t *this) +afr_fremovexattr_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + local = frame->local; - if (main_frame) { - AFR_STACK_UNWIND (fremovexattr, main_frame, - local->op_ret, local->op_errno, - NULL); - } + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(fremovexattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); + return 0; +} int -afr_fremovexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +afr_fremovexattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int call_count = -1; - int need_unwind = 0; - int child_index = (long) cookie; + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL, + NULL, NULL, xdata); +} + +int +afr_fremovexattr_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - LOCK (&frame->lock); - { - __inode_write_fop_cbk (frame, child_index, -1, this, - &op_ret, &op_errno, NULL, NULL, - xdata); + STACK_WIND_COOKIE(frame, afr_fremovexattr_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->fremovexattr, local->fd, + local->cont.removexattr.name, local->xdata_req); + return 0; +} - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); +int +afr_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - if (need_unwind) - local->transaction.unwind (frame, this); + GF_IF_NATIVE_XATTR_GOTO("trusted.afr.*", name, op_errno, out); - call_count = afr_frame_return (frame); + GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.afr.*", name, op_errno, out); - if (call_count == 0) { - local->transaction.resume (frame, this); - } + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - return 0; -} + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + local->cont.removexattr.name = gf_strdup(name); + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); -int32_t -afr_fremovexattr_wind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; + if (!local->xdata_req) + goto out; - local = frame->local; - priv = this->private; + local->transaction.wind = afr_fremovexattr_wind; + local->transaction.unwind = afr_fremovexattr_unwind; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } + local->op = GF_FOP_FREMOVEXATTR; - local->call_count = call_count; + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_fremovexattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fremovexattr, - local->fd, - local->cont.removexattr.name, - NULL); - - if (!--call_count) - break; - } - } + ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - return 0; -} + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); + + AFR_STACK_UNWIND(fremovexattr, frame, -1, op_errno, NULL); + return 0; +} int -afr_fremovexattr_done (call_frame_t *frame, xlator_t *this) +afr_fallocate_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = frame->local; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); + local = frame->local; + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(fallocate, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; +} int -afr_fremovexattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, const char *name, dict_t *xdata) +afr_fallocate_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf, + postbuf, NULL, xdata); +} - VALIDATE_OR_GOTO (this, out); +int +afr_fallocate_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*", - name, op_errno, out); + local = frame->local; + priv = this->private; - GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*", - name, op_errno, out); + STACK_WIND_COOKIE(frame, afr_fallocate_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->fallocate, local->fd, + local->cont.fallocate.mode, local->cont.fallocate.offset, + local->cont.fallocate.len, local->xdata_req); + return 0; +} - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this->private, out); +int +afr_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + call_frame_t *transaction_frame = NULL; + afr_local_t *local = NULL; + int ret = -1; + int op_errno = ENOMEM; - priv = this->private; - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - QUORUM_CHECK(fremovexattr, out); + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - goto out; - } + local->cont.fallocate.mode = mode; + local->cont.fallocate.offset = offset; + local->cont.fallocate.len = len; - AFR_LOCAL_ALLOC_OR_GOTO (local, out); + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) { - op_errno = -ret; - goto out; - } + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - transaction_frame->local = local; + if (!local->xdata_req) + goto out; - local->op_ret = -1; + local->op = GF_FOP_FALLOCATE; - local->cont.removexattr.name = gf_strdup (name); + local->transaction.wind = afr_fallocate_wind; + local->transaction.unwind = afr_fallocate_unwind; - local->transaction.fop = afr_fremovexattr_wind; - local->transaction.done = afr_fremovexattr_done; - local->transaction.unwind = afr_fremovexattr_unwind; + local->transaction.main_frame = frame; - local->fd = fd_ref (fd); + local->transaction.start = local->cont.fallocate.offset; + local->transaction.len = 0; - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; + afr_fix_open(fd, this); - op_ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } + ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + return 0; out: - if (op_ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } -static int -afr_fallocate_unwind (call_frame_t *frame, xlator_t *this) +/* }}} */ + +/* {{{ discard */ + +int +afr_discard_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; + local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (fallocate, main_frame, local->op_ret, - local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - NULL); - } + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} -static int -afr_fallocate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int child_index = (long) cookie; - int call_count = -1; - int need_unwind = 0; - int read_child = 0; - - local = frame->local; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, prebuf, postbuf, - xdata); - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); + AFR_STACK_UNWIND(discard, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; +} - if (need_unwind) - local->transaction.unwind (frame, this); +int +afr_discard_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf, + postbuf, NULL, xdata); +} - call_count = afr_frame_return (frame); +int +afr_discard_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - if (call_count == 0) { - local->transaction.resume (frame, this); - } + local = frame->local; + priv = this->private; - return 0; + STACK_WIND_COOKIE(frame, afr_discard_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->discard, local->fd, + local->cont.discard.offset, local->cont.discard.len, + local->xdata_req); + return 0; } -static int -afr_fallocate_wind (call_frame_t *frame, xlator_t *this) +int +afr_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - local = frame->local; - priv = this->private; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } + local->cont.discard.offset = offset; + local->cont.discard.len = len; - local->call_count = call_count; + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_fallocate_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fallocate, - local->fd, - local->cont.fallocate.mode, - local->cont.fallocate.offset, - local->cont.fallocate.len, - NULL); - - if (!--call_count) - break; - } - } + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - return 0; -} + if (!local->xdata_req) + goto out; -static int -afr_fallocate_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; + local->op = GF_FOP_DISCARD; - local = frame->local; + local->transaction.wind = afr_discard_wind; + local->transaction.unwind = afr_discard_unwind; - local->transaction.unwind (frame, this); + local->transaction.main_frame = frame; - AFR_STACK_DESTROY (frame); + local->transaction.start = local->cont.discard.offset; + local->transaction.len = 0; - return 0; -} + afr_fix_open(fd, this); -static int -afr_do_fallocate (call_frame_t *frame, xlator_t *this) -{ - call_frame_t * transaction_frame = NULL; - afr_local_t * local = NULL; - int op_ret = -1; - int op_errno = 0; + ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - local = frame->local; + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - goto out; - } + AFR_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} - transaction_frame->local = local; - frame->local = NULL; +/* {{{ zerofill */ - local->op = GF_FOP_FALLOCATE; +int +afr_zerofill_unwind(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - local->transaction.fop = afr_fallocate_wind; - local->transaction.done = afr_fallocate_done; - local->transaction.unwind = afr_fallocate_unwind; + local = frame->local; - local->transaction.main_frame = frame; + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) + return 0; - local->transaction.start = local->cont.fallocate.offset; - local->transaction.len = 0; + AFR_STACK_UNWIND(discard, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; +} - /* fallocate can modify the file size */ - op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } +int +afr_zerofill_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf, + postbuf, NULL, xdata); +} - op_ret = 0; -out: - if (op_ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (fallocate, frame, op_ret, op_errno, NULL, - NULL, NULL); - } +int +afr_zerofill_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - return 0; + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE(frame, afr_zerofill_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->zerofill, local->fd, + local->cont.zerofill.offset, local->cont.zerofill.len, + local->xdata_req); + return 0; } int -afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, - off_t offset, size_t len, dict_t *xdata) +afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - priv = this->private; + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } - QUORUM_CHECK(fallocate,out); + local->cont.zerofill.offset = offset; + local->cont.zerofill.len = len; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - local->cont.fallocate.mode = mode; - local->cont.fallocate.offset = offset; - local->cont.fallocate.len = len; + if (!local->xdata_req) + goto out; - local->fd = fd_ref (fd); + local->op = GF_FOP_ZEROFILL; - afr_open_fd_fix (fd, this); + local->transaction.wind = afr_zerofill_wind; + local->transaction.unwind = afr_zerofill_unwind; - afr_do_fallocate (frame, this); + local->transaction.main_frame = frame; - ret = 0; + local->transaction.start = local->cont.zerofill.offset; + local->transaction.len = len; + + afr_fix_open(fd, this); + + ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } /* }}} */ -/* {{{ discard */ +int32_t +afr_xattrop_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr, + dict_t *xdata) +{ + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL, + NULL, xattr, xdata); +} -static int -afr_discard_unwind (call_frame_t *frame, xlator_t *this) +int +afr_xattrop_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; + local = frame->local; + priv = this->private; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (discard, main_frame, local->op_ret, - local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - NULL); - } - return 0; + STACK_WIND_COOKIE(frame, afr_xattrop_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->xattrop, &local->loc, + local->cont.xattrop.optype, local->cont.xattrop.xattr, + local->xdata_req); + return 0; } -static int -afr_discard_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +int +afr_xattrop_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int child_index = (long) cookie; - int call_count = -1; - int need_unwind = 0; - int read_child = 0; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; - priv = this->private; + local = frame->local; - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) + return 0; - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, prebuf, postbuf, - xdata); + AFR_STACK_UNWIND(xattrop, main_frame, local->op_ret, local->op_errno, + local->xattr_rsp, local->xdata_rsp); + return 0; +} - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); +int32_t +afr_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +{ + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - if (need_unwind) - local->transaction.unwind (frame, this); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - call_count = afr_frame_return (frame); + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - if (call_count == 0) { - local->transaction.resume (frame, this); - } + local->cont.xattrop.xattr = dict_ref(xattr); + local->cont.xattrop.optype = optype; + if (xdata) + local->xdata_req = dict_ref(xdata); - return 0; -} + local->transaction.wind = afr_xattrop_wind; + local->transaction.unwind = afr_xattrop_unwind; -static int -afr_discard_wind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; + loc_copy(&local->loc, loc); + ret = afr_set_inode_local(this, local, loc->inode); + if (ret) + goto out; - local = frame->local; - priv = this->private; + local->op = GF_FOP_XATTROP; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } + ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - local->call_count = call_count; + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_discard_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->discard, - local->fd, - local->cont.discard.offset, - local->cont.discard.len, - NULL); - - if (!--call_count) - break; - } - } + AFR_STACK_UNWIND(xattrop, frame, -1, op_errno, NULL, NULL); + return 0; +} - return 0; +int32_t +afr_fxattrop_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr, + dict_t *xdata) +{ + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL, + NULL, xattr, xdata); } -static int -afr_discard_done (call_frame_t *frame, xlator_t *this) +int +afr_fxattrop_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; + local = frame->local; + priv = this->private; - local->transaction.unwind (frame, this); + STACK_WIND_COOKIE(frame, afr_fxattrop_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->fxattrop, local->fd, + local->cont.xattrop.optype, local->cont.xattrop.xattr, + local->xdata_req); + return 0; +} - AFR_STACK_DESTROY (frame); +int +afr_fxattrop_unwind(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; + + AFR_STACK_UNWIND(fxattrop, main_frame, local->op_ret, local->op_errno, + local->xattr_rsp, local->xdata_rsp); + return 0; } -static int -afr_do_discard (call_frame_t *frame, xlator_t *this) +int32_t +afr_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { - call_frame_t * transaction_frame = NULL; - afr_local_t * local = NULL; - int op_ret = -1; - int op_errno = 0; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - local = frame->local; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - goto out; - } + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - transaction_frame->local = local; - frame->local = NULL; + local->cont.xattrop.xattr = dict_ref(xattr); + local->cont.xattrop.optype = optype; + if (xdata) + local->xdata_req = dict_ref(xdata); - local->op = GF_FOP_DISCARD; + local->transaction.wind = afr_fxattrop_wind; + local->transaction.unwind = afr_fxattrop_unwind; - local->transaction.fop = afr_discard_wind; - local->transaction.done = afr_discard_done; - local->transaction.unwind = afr_discard_unwind; + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; - local->transaction.main_frame = frame; + local->op = GF_FOP_FXATTROP; - local->transaction.start = local->cont.discard.offset; - local->transaction.len = 0; + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } + ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + return 0; out: - if (op_ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (discard, frame, op_ret, op_errno, NULL, - NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL); + return 0; } int -afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - size_t len, dict_t *xdata) +afr_fsync_unwind(call_frame_t *frame, xlator_t *this) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - priv = this->private; + local = frame->local; - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } - QUORUM_CHECK(discard, out); + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) + return 0; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + AFR_STACK_UNWIND(fsync, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + return 0; +} - local->cont.discard.offset = offset; - local->cont.discard.len = len; +int +afr_fsync_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf, + postbuf, NULL, xdata); +} - local->fd = fd_ref (fd); +int +afr_fsync_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - afr_open_fd_fix (fd, this); + local = frame->local; + priv = this->private; - afr_do_discard(frame, this); + STACK_WIND_COOKIE(frame, afr_fsync_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->fsync, local->fd, + local->cont.fsync.datasync, local->xdata_req); + return 0; +} - ret = 0; +int +afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) +{ + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int32_t op_errno = ENOMEM; + int8_t last_fsync = 0; + + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + if (xdata) { + local->xdata_req = dict_copy_with_ref(xdata, NULL); + if (dict_get_int8(xdata, "last-fsync", &last_fsync) == 0) { + if (last_fsync) { + local->transaction.disable_delayed_post_op = _gf_true; + } + } + } else { + local->xdata_req = dict_new(); + } + + if (!local->xdata_req) + goto out; + + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; + + local->op = GF_FOP_FSYNC; + local->cont.fsync.datasync = datasync; + + if (afr_fd_has_witnessed_unstable_write(this, fd->inode)) { + /* don't care. we only wanted to CLEAR the bit */ + } + + local->transaction.wind = afr_fsync_wind; + local->transaction.unwind = afr_fsync_unwind; + + local->transaction.main_frame = frame; + + ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(fsync, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; } diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h index 883faae6cb5..a787069b7a1 100644 --- a/xlators/cluster/afr/src/afr-inode-write.h +++ b/xlators/cluster/afr/src/afr-inode-write.h @@ -12,67 +12,83 @@ #define __INODE_WRITE_H__ int32_t -afr_chmod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dict_t *xdata); +afr_chmod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dict_t *xdata); int32_t -afr_chown (call_frame_t *frame, xlator_t *this, - loc_t *loc, uid_t uid, gid_t gid, dict_t *xdata); +afr_chown(call_frame_t *frame, xlator_t *this, loc_t *loc, uid_t uid, gid_t gid, + dict_t *xdata); int -afr_fchown (call_frame_t *frame, xlator_t *this, - fd_t *fd, uid_t uid, gid_t gid, dict_t *xdata); +afr_fchown(call_frame_t *frame, xlator_t *this, fd_t *fd, uid_t uid, gid_t gid, + dict_t *xdata); int32_t -afr_fchmod (call_frame_t *frame, xlator_t *this, - fd_t *fd, mode_t mode, dict_t *xdata); +afr_fchmod(call_frame_t *frame, xlator_t *this, fd_t *fd, mode_t mode, + dict_t *xdata); int32_t -afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int32_t count, off_t offset, - uint32_t flags, struct iobref *iobref, dict_t *xdata); +afr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata); int32_t -afr_truncate (call_frame_t *frame, xlator_t *this, - loc_t *loc, off_t offset, dict_t *xdata); +afr_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata); int32_t -afr_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset, dict_t *xdata); +afr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata); int32_t -afr_utimens (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct timespec tv[2], dict_t *xdata); +afr_utimens(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct timespec tv[2], dict_t *xdata); int -afr_setattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct iatt *buf, int32_t valid, dict_t *xdata); +afr_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf, + int32_t valid, dict_t *xdata); int -afr_fsetattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata); +afr_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf, + int32_t valid, dict_t *xdata); int32_t -afr_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata); +afr_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata); int32_t -afr_fsetxattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata); +afr_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata); int32_t -afr_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name, dict_t *xdata); +afr_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata); int32_t -afr_fremovexattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, const char *name, dict_t *xdata); +afr_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata); int -afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - size_t len, dict_t *xdata); +afr_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata); int -afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, - off_t offset, size_t len, dict_t *xdata); +afr_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata); + +int +afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata); + +int32_t +afr_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata); + +int32_t +afr_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata); + +int +afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata); #endif /* __INODE_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c index e2136089bdc..bc8eabe0f43 100644 --- a/xlators/cluster/afr/src/afr-lk-common.c +++ b/xlators/cluster/afr/src/afr-lk-common.c @@ -8,2163 +8,784 @@ cases as published by the Free Software Foundation. */ -#include "dict.h" -#include "byte-order.h" -#include "common-utils.h" +#include <glusterfs/dict.h> +#include <glusterfs/byte-order.h> +#include <glusterfs/common-utils.h> #include "afr.h" #include "afr-transaction.h" +#include "afr-messages.h" #include <signal.h> - -#define LOCKED_NO 0x0 /* no lock held */ -#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path */ -#define LOCKED_LOWER 0x2 /* for lower path */ - -#define AFR_TRACE_INODELK_IN(frame, this, params ...) \ - do { \ - afr_private_t *_priv = this->private; \ - if (!_priv->inodelk_trace) \ - break; \ - afr_trace_inodelk_in (frame, this, params); \ - } while (0); - -#define AFR_TRACE_INODELK_OUT(frame, this, params ...) \ - do { \ - afr_private_t *_priv = this->private; \ - if (!_priv->inodelk_trace) \ - break; \ - afr_trace_inodelk_out (frame, this, params); \ - } while (0); - -#define AFR_TRACE_ENTRYLK_IN(frame, this, params ...) \ - do { \ - afr_private_t *_priv = this->private; \ - if (!_priv->entrylk_trace) \ - break; \ - afr_trace_entrylk_in (frame, this, params); \ - } while (0); - -#define AFR_TRACE_ENTRYLK_OUT(frame, this, params ...) \ - do { \ - afr_private_t *_priv = this->private; \ - if (!_priv->entrylk_trace) \ - break; \ - afr_trace_entrylk_out (frame, this, params); \ - } while (0); - -int -afr_entry_lockee_cmp (const void *l1, const void *l2) -{ - const afr_entry_lockee_t *r1 = l1; - const afr_entry_lockee_t *r2 = l2; - int ret = 0; - uuid_t gfid1 = {0}; - uuid_t gfid2 = {0}; - - loc_gfid ((loc_t*)&r1->loc, gfid1); - loc_gfid ((loc_t*)&r2->loc, gfid2); - ret = uuid_compare (gfid1, gfid2); - /*Entrylks with NULL basename are the 'smallest'*/ - if (ret == 0) { - if (!r1->basename) - return -1; - if (!r2->basename) - return 1; - ret = strcmp (r1->basename, r2->basename); - } - - if (ret <= 0) - return -1; - else - return 1; -} - -int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index); - -static int -afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this); - -static uint64_t afr_lock_number = 1; - -static uint64_t -get_afr_lock_number () -{ - return (++afr_lock_number); -} - -int -afr_set_lock_number (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - int_lock->lock_number = get_afr_lock_number (); - - return 0; -} +#define LOCKED_NO 0x0 /* no lock held */ +#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path */ +#define LOCKED_LOWER 0x2 /* for lower path */ void -afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner) -{ - gf_log (this->name, GF_LOG_TRACE, - "Setting lk-owner=%llu", - (unsigned long long) (unsigned long)lk_owner); - - set_lk_owner_from_ptr (&frame->root->lk_owner, lk_owner); -} - -static int -is_afr_lock_selfheal (afr_local_t *local) +afr_lockee_cleanup(afr_lockee_t *lockee) { - afr_internal_lock_t *int_lock = NULL; - int ret = -1; - - int_lock = &local->internal_lock; - - switch (int_lock->selfheal_lk_type) { - case AFR_DATA_SELF_HEAL_LK: - case AFR_METADATA_SELF_HEAL_LK: - ret = 1; - break; - case AFR_ENTRY_SELF_HEAL_LK: - ret = 0; - break; - } + if (lockee->fd) { + fd_unref(lockee->fd); + lockee->fd = NULL; + } else { + loc_wipe(&lockee->loc); + } - return ret; + GF_FREE(lockee->basename); + lockee->basename = NULL; + GF_FREE(lockee->locked_nodes); + lockee->locked_nodes = NULL; -} - -int32_t -internal_lock_count (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int32_t call_count = 0; - int i = 0; - - local = frame->local; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) - ++call_count; - } - - return call_count; -} - -static void -afr_print_inodelk (char *str, int size, int cmd, - struct gf_flock *flock, gf_lkowner_t *owner) -{ - char *cmd_str = NULL; - char *type_str = NULL; - - switch (cmd) { -#if F_GETLK != F_GETLK64 - case F_GETLK64: -#endif - case F_GETLK: - cmd_str = "GETLK"; - break; - -#if F_SETLK != F_SETLK64 - case F_SETLK64: -#endif - case F_SETLK: - cmd_str = "SETLK"; - break; - -#if F_SETLKW != F_SETLKW64 - case F_SETLKW64: -#endif - case F_SETLKW: - cmd_str = "SETLKW"; - break; - - default: - cmd_str = "<null>"; - break; - } - - switch (flock->l_type) { - case F_RDLCK: - type_str = "READ"; - break; - case F_WRLCK: - type_str = "WRITE"; - break; - case F_UNLCK: - type_str = "UNLOCK"; - break; - default: - type_str = "UNKNOWN"; - break; - } - - snprintf (str, size, "lock=INODELK, cmd=%s, type=%s, " - "start=%llu, len=%llu, pid=%llu, lk-owner=%s", - cmd_str, type_str, (unsigned long long) flock->l_start, - (unsigned long long) flock->l_len, - (unsigned long long) flock->l_pid, - lkowner_utoa (owner)); - -} - -static void -afr_print_lockee (char *str, int size, loc_t *loc, fd_t *fd, - int child_index) -{ - snprintf (str, size, "path=%s, fd=%p, child=%d", - loc->path ? loc->path : "<nul>", - fd ? fd : NULL, - child_index); + return; } void -afr_print_entrylk (char *str, int size, const char *basename, - gf_lkowner_t *owner) -{ - snprintf (str, size, "Basename=%s, lk-owner=%s", - basename ? basename : "<nul>", - lkowner_utoa (owner)); -} - -static void -afr_print_verdict (int op_ret, int op_errno, char *str) +afr_lockees_cleanup(afr_internal_lock_t *int_lock) { - if (op_ret < 0) { - if (op_errno == EAGAIN) - strcpy (str, "EAGAIN"); - else - strcpy (str, "FAILED"); - } - else - strcpy (str, "GRANTED"); -} + int i = 0; -static void -afr_set_lock_call_type (afr_lock_call_type_t lock_call_type, - char *lock_call_type_str, - afr_internal_lock_t *int_lock) -{ - switch (lock_call_type) { - case AFR_INODELK_TRANSACTION: - if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) - strcpy (lock_call_type_str, "AFR_INODELK_TRANSACTION"); - else - strcpy (lock_call_type_str, "AFR_INODELK_SELFHEAL"); - break; - case AFR_INODELK_NB_TRANSACTION: - if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) - strcpy (lock_call_type_str, "AFR_INODELK_NB_TRANSACTION"); - else - strcpy (lock_call_type_str, "AFR_INODELK_NB_SELFHEAL"); - break; - case AFR_ENTRYLK_TRANSACTION: - if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) - strcpy (lock_call_type_str, "AFR_ENTRYLK_TRANSACTION"); - else - strcpy (lock_call_type_str, "AFR_ENTRYLK_SELFHEAL"); - break; - case AFR_ENTRYLK_NB_TRANSACTION: - if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) - strcpy (lock_call_type_str, "AFR_ENTRYLK_NB_TRANSACTION"); - else - strcpy (lock_call_type_str, "AFR_ENTRYLK_NB_SELFHEAL"); - break; - default: - strcpy (lock_call_type_str, "UNKNOWN"); - break; - } + for (i = 0; i < int_lock->lockee_count; i++) { + afr_lockee_cleanup(&int_lock->lockee[i]); + } + return; } - -static void -afr_trace_inodelk_out (call_frame_t *frame, xlator_t *this, - afr_lock_call_type_t lock_call_type, - afr_lock_op_type_t lk_op_type, struct gf_flock *flock, - int op_ret, int op_errno, int32_t child_index) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - char lockee[256]; - char lock_call_type_str[256]; - char verdict[16]; - - local = frame->local; - int_lock = &local->internal_lock; - - afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); - - afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); - - afr_print_verdict (op_ret, op_errno, verdict); - - gf_log (this->name, GF_LOG_INFO, - "[%s %s] [%s] lk-owner=%s Lockee={%s} Number={%llu}", - lock_call_type_str, - lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY", - verdict, lkowner_utoa (&frame->root->lk_owner), lockee, - (unsigned long long) int_lock->lock_number); - -} - -static void -afr_trace_inodelk_in (call_frame_t *frame, xlator_t *this, - afr_lock_call_type_t lock_call_type, - afr_lock_op_type_t lk_op_type, struct gf_flock *flock, - int32_t cmd, int32_t child_index) -{ - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - - char lock[256]; - char lockee[256]; - char lock_call_type_str[256]; - - local = frame->local; - int_lock = &local->internal_lock; - - afr_print_inodelk (lock, 256, cmd, flock, &frame->root->lk_owner); - afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); - - afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); - - gf_log (this->name, GF_LOG_INFO, - "[%s %s] Lock={%s} Lockee={%s} Number={%llu}", - lock_call_type_str, - lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST", - lock, lockee, - (unsigned long long) int_lock->lock_number); - +int +afr_entry_lockee_cmp(const void *l1, const void *l2) +{ + const afr_lockee_t *r1 = l1; + const afr_lockee_t *r2 = l2; + int ret = 0; + uuid_t gfid1 = {0}; + uuid_t gfid2 = {0}; + + loc_gfid((loc_t *)&r1->loc, gfid1); + loc_gfid((loc_t *)&r2->loc, gfid2); + ret = gf_uuid_compare(gfid1, gfid2); + /*Entrylks with NULL basename are the 'smallest'*/ + if (ret == 0) { + if (!r1->basename) + return -1; + if (!r2->basename) + return 1; + ret = strcmp(r1->basename, r2->basename); + } + + if (ret <= 0) + return -1; + else + return 1; } -static void -afr_trace_entrylk_in (call_frame_t *frame, xlator_t *this, - afr_lock_call_type_t lock_call_type, - afr_lock_op_type_t lk_op_type, const char *basename, - int32_t cookie) -{ - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - int child_index = 0; - int lockee_no = 0; - - char lock[256]; - char lockee[256]; - char lock_call_type_str[256]; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - if (!priv->entrylk_trace) { - return; - } - lockee_no = cookie / priv->child_count; - child_index = cookie % priv->child_count; - - afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner); - afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd, - child_index); - - afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); - - gf_log (this->name, GF_LOG_INFO, - "[%s %s] Lock={%s} Lockee={%s} Number={%llu}, Cookie={%d}", - lock_call_type_str, - lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST", - lock, lockee, - (unsigned long long) int_lock->lock_number, - cookie); -} +int +afr_lock_blocking(call_frame_t *frame, xlator_t *this, int child_index); -static void -afr_trace_entrylk_out (call_frame_t *frame, xlator_t *this, - afr_lock_call_type_t lock_call_type, - afr_lock_op_type_t lk_op_type, const char *basename, - int op_ret, int op_errno, int32_t cookie) +void +afr_set_lk_owner(call_frame_t *frame, xlator_t *this, void *lk_owner) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int lockee_no = 0; - int child_index = 0; - - char lock[256]; - char lockee[256]; - char lock_call_type_str[256]; - char verdict[16]; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - if (!priv->entrylk_trace) { - return; - } - lockee_no = cookie / priv->child_count; - child_index = cookie % priv->child_count; - - afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner); - afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd, - child_index); - - afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); - - afr_print_verdict (op_ret, op_errno, verdict); - - gf_log (this->name, GF_LOG_INFO, - "[%s %s] [%s] Lock={%s} Lockee={%s} Number={%llu} Cookie={%d}", - lock_call_type_str, - lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY", - verdict, - lock, lockee, - (unsigned long long) int_lock->lock_number, - cookie); + gf_msg_trace(this->name, 0, "Setting lk-owner=%llu", + (unsigned long long)(unsigned long)lk_owner); + set_lk_owner_from_ptr(&frame->root->lk_owner, lk_owner); } -static int -transaction_lk_op (afr_local_t *local) +int32_t +internal_lock_count(call_frame_t *frame, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; - int ret = -1; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int32_t call_count = 0; + int i = 0; - int_lock = &local->internal_lock; + local = frame->local; + priv = this->private; - if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) { - gf_log (THIS->name, GF_LOG_DEBUG, - "lk op is for a transaction"); - ret = 1; - } - else if (int_lock->transaction_lk_type == AFR_SELFHEAL_LK) { - gf_log (THIS->name, GF_LOG_DEBUG, - "lk op is for a self heal"); - - ret = 0; - } - - if (ret == -1) - gf_log (THIS->name, GF_LOG_DEBUG, - "lk op is not set"); - - return ret; + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) + ++call_count; + } + return call_count; } -static int -is_afr_lock_transaction (afr_local_t *local) +int +afr_add_entry_lockee(afr_local_t *local, loc_t *loc, char *basename, + int child_count) { - int ret = 0; + int ret = -ENOMEM; + afr_internal_lock_t *int_lock = &local->internal_lock; + afr_lockee_t *lockee = &int_lock->lockee[int_lock->lockee_count]; - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - ret = 1; - break; + GF_ASSERT(int_lock->lockee_count < AFR_LOCKEE_COUNT_MAX); + loc_copy(&lockee->loc, loc); + lockee->basename = (basename) ? gf_strdup(basename) : NULL; + if (basename && !lockee->basename) + goto out; - case AFR_ENTRY_RENAME_TRANSACTION: - case AFR_ENTRY_TRANSACTION: - ret = 0; - break; + lockee->locked_count = 0; + lockee->locked_nodes = GF_CALLOC(child_count, sizeof(*lockee->locked_nodes), + gf_afr_mt_afr_node_character); - } + if (!lockee->locked_nodes) + goto out; - return ret; + ret = 0; + int_lock->lockee_count++; +out: + if (ret) { + afr_lockee_cleanup(lockee); + } + return ret; } int -afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local, - loc_t *loc, char *basename, int child_count) +afr_add_inode_lockee(afr_local_t *local, int child_count) { - int ret = -1; + int ret = -ENOMEM; + afr_internal_lock_t *int_lock = &local->internal_lock; + afr_lockee_t *lockee = &int_lock->lockee[int_lock->lockee_count]; - loc_copy (&lockee->loc, loc); - lockee->basename = (basename)? gf_strdup (basename): NULL; - if (basename && !lockee->basename) - goto out; + if (local->fd) { + lockee->fd = fd_ref(local->fd); + } else { + loc_copy(&lockee->loc, &local->loc); + } - lockee->locked_count = 0; - lockee->locked_nodes = GF_CALLOC (child_count, - sizeof (*lockee->locked_nodes), - gf_afr_mt_afr_node_character); + lockee->locked_count = 0; + lockee->locked_nodes = GF_CALLOC(child_count, sizeof(*lockee->locked_nodes), + gf_afr_mt_afr_node_character); - if (!lockee->locked_nodes) - goto out; + if (!lockee->locked_nodes) + goto out; - ret = 0; + ret = 0; + int_lock->lockee_count++; out: - return ret; - -} - -void -afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock) -{ - int i = 0; - - for (i = 0; i < int_lock->lockee_count; i++) { - loc_wipe (&int_lock->lockee[i].loc); - if (int_lock->lockee[i].basename) - GF_FREE (int_lock->lockee[i].basename); - if (int_lock->lockee[i].locked_nodes) - GF_FREE (int_lock->lockee[i].locked_nodes); - } - - return; + if (ret) { + afr_lockee_cleanup(lockee); + } + return ret; } static int -initialize_entrylk_variables (call_frame_t *frame, xlator_t *this) +initialize_internal_lock_variables(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - - int i = 0; - - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; - - int_lock->entrylk_lock_count = 0; - int_lock->lock_op_ret = -1; - int_lock->lock_op_errno = 0; - - for (i = 0; i < AFR_LOCKEE_COUNT_MAX; i++) { - if (!int_lock->lockee[i].locked_nodes) - break; - int_lock->lockee[i].locked_count = 0; - memset (int_lock->lockee[i].locked_nodes, 0, - sizeof (*int_lock->lockee[i].locked_nodes) * - priv->child_count); - } + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + afr_private_t *priv = NULL; - return 0; -} + int i = 0; -static int -initialize_inodelk_variables (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - afr_inodelk_t *inodelk = NULL; - - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; - - inodelk = afr_get_inodelk (int_lock, int_lock->domain); + priv = this->private; + local = frame->local; + int_lock = &local->internal_lock; - inodelk->lock_count = 0; - int_lock->lk_attempted_count = 0; - int_lock->lock_op_ret = -1; - int_lock->lock_op_errno = 0; + int_lock->lock_count = 0; + int_lock->lock_op_ret = -1; + int_lock->lock_op_errno = 0; + int_lock->lk_attempted_count = 0; - memset (inodelk->locked_nodes, 0, - sizeof (*inodelk->locked_nodes) * priv->child_count); - memset (int_lock->locked_nodes, 0, - sizeof (*int_lock->locked_nodes) * priv->child_count); + for (i = 0; i < AFR_LOCKEE_COUNT_MAX; i++) { + if (!int_lock->lockee[i].locked_nodes) + break; + int_lock->lockee[i].locked_count = 0; + memset(int_lock->lockee[i].locked_nodes, 0, + sizeof(*int_lock->lockee[i].locked_nodes) * priv->child_count); + } - return 0; -} - -loc_t * -lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2) -{ - int ret = 0; - - ret = uuid_compare (l1->inode->gfid, l2->inode->gfid); - - if (ret == 0) - ret = strcmp (b1, b2); - - if (ret <= 0) - return l1; - else - return l2; + return 0; } int -afr_lockee_locked_nodes_count (afr_internal_lock_t *int_lock) +afr_lockee_locked_nodes_count(afr_internal_lock_t *int_lock) { - int call_count = 0; - int i = 0; + int call_count = 0; + int i = 0; - for (i = 0; i < int_lock->lockee_count; i++) - call_count += int_lock->lockee[i].locked_count; + for (i = 0; i < int_lock->lockee_count; i++) + call_count += int_lock->lockee[i].locked_count; - return call_count; + return call_count; } int -afr_locked_nodes_count (unsigned char *locked_nodes, int child_count) - -{ - int i = 0; - int call_count = 0; - - for (i = 0; i < child_count; i++) { - if (locked_nodes[i] & LOCKED_YES) - call_count++; - } - - return call_count; -} - -/* FIXME: What if UNLOCK fails */ -static int32_t -afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - int call_count = 0; - - local = frame->local; - int_lock = &local->internal_lock; - - LOCK (&frame->lock); - { - call_count = --int_lock->lk_call_count; - } - UNLOCK (&frame->lock); - - if (call_count == 0) { - gf_log (this->name, GF_LOG_TRACE, - "All internal locks unlocked"); - int_lock->lock_cbk (frame, this); - } - - return 0; -} - -static int32_t -afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - afr_inodelk_t *inodelk = NULL; - int32_t child_index = (long)cookie; - - local = frame->local; - int_lock = &local->internal_lock; - - AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_TRANSACTION, - AFR_UNLOCK_OP, NULL, op_ret, - op_errno, child_index); - - if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { - gf_log (this->name, GF_LOG_INFO, "%s: unlock failed on %d " - "unlock by %s", local->loc.path, child_index, - lkowner_utoa (&frame->root->lk_owner)); - } - - - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - inodelk->locked_nodes[child_index] &= LOCKED_NO; - if (local->transaction.eager_lock) - local->transaction.eager_lock[child_index] = 0; - - afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno, xdata); - - return 0; - -} - -static int -afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_inodelk_t *inodelk = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - struct gf_flock flock = {0,}; - struct gf_flock full_flock = {0,}; - struct gf_flock *flock_use = NULL; - int call_count = 0; - int i = 0; - int piggyback = 0; - afr_fd_ctx_t *fd_ctx = NULL; - - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - - flock.l_start = inodelk->flock.l_start; - flock.l_len = inodelk->flock.l_len; - flock.l_type = F_UNLCK; - - full_flock.l_type = F_UNLCK; - call_count = afr_locked_nodes_count (inodelk->locked_nodes, - priv->child_count); - - int_lock->lk_call_count = call_count; - - if (!call_count) { - gf_log (this->name, GF_LOG_TRACE, - "No internal locks unlocked"); - int_lock->lock_cbk (frame, this); - goto out; - } - - if (local->fd) - fd_ctx = afr_fd_ctx_get (local->fd, this); - - for (i = 0; i < priv->child_count; i++) { - if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES) - continue; - - if (local->fd) { - flock_use = &flock; - if (!local->transaction.eager_lock[i]) { - goto wind; - } - - piggyback = 0; - - LOCK (&local->fd->lock); - { - if (fd_ctx->lock_piggyback[i]) { - fd_ctx->lock_piggyback[i]--; - piggyback = 1; - } else { - fd_ctx->lock_acquired[i]--; - } - } - UNLOCK (&local->fd->lock); - - if (piggyback) { - afr_unlock_inodelk_cbk (frame, (void *) (long) i, - this, 1, 0, NULL); - if (!--call_count) - break; - continue; - } - - flock_use = &full_flock; - wind: - AFR_TRACE_INODELK_IN (frame, this, - AFR_INODELK_TRANSACTION, - AFR_UNLOCK_OP, flock_use, F_SETLK, - i); - - STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, - (void *) (long)i, - priv->children[i], - priv->children[i]->fops->finodelk, - int_lock->domain, local->fd, - F_SETLK, flock_use, NULL); - - if (!--call_count) - break; - - } else { - AFR_TRACE_INODELK_IN (frame, this, - AFR_INODELK_TRANSACTION, - AFR_UNLOCK_OP, &flock, F_SETLK, i); - - STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, - (void *) (long)i, - priv->children[i], - priv->children[i]->fops->inodelk, - int_lock->domain, &local->loc, - F_SETLK, &flock, NULL); - - if (!--call_count) - break; - } - } -out: - return 0; -} - -static int32_t -afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_internal_lock_t *int_lock = NULL; - int32_t child_index = 0; - int lockee_no = 0; - - priv = this->private; - lockee_no = (int)((long) cookie) / priv->child_count; - child_index = (int) ((long) cookie) % priv->child_count; - - local = frame->local; - int_lock = &local->internal_lock; - - AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, - AFR_UNLOCK_OP, - int_lock->lockee[lockee_no].basename, op_ret, - op_errno, (int) ((long)cookie)); +afr_locked_nodes_count(unsigned char *locked_nodes, int child_count) - if (op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "%s: unlock failed on %d, reason: %s", - local->loc.path, child_index, strerror (op_errno)); - } - - int_lock->lockee[lockee_no].locked_nodes[child_index] &= LOCKED_NO; - afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno, NULL); - - return 0; -} - -static int -afr_unlock_entrylk (call_frame_t *frame, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int index = 0; - int lockee_no = 0; - int copies = 0; - int i = -1; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - copies = priv->child_count; - - call_count = afr_lockee_locked_nodes_count (int_lock); - - int_lock->lk_call_count = call_count; - - if (!call_count){ - gf_log (this->name, GF_LOG_TRACE, - "No internal locks unlocked"); - int_lock->lock_cbk (frame, this); - goto out; - } + int i = 0; + int call_count = 0; - for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) { - lockee_no = i / copies; - index = i % copies; - if (int_lock->lockee[lockee_no].locked_nodes[index] & LOCKED_YES) { - AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, - AFR_UNLOCK_OP, - int_lock->lockee[lockee_no].basename, - i); - - STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk, - (void *) (long) i, - priv->children[index], - priv->children[index]->fops->entrylk, - int_lock->domain, - &int_lock->lockee[lockee_no].loc, - int_lock->lockee[lockee_no].basename, - ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL); - - if (!--call_count) - break; - } - } - -out: - return 0; + for (i = 0; i < child_count; i++) { + if (locked_nodes[i] & LOCKED_YES) + call_count++; + } + return call_count; } -static int32_t -afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int cky = (long) cookie; - int child_index = 0; - int lockee_no = 0; - - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; - - child_index = ((int)cky) % priv->child_count; - lockee_no = ((int)cky) / priv->child_count; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - if (op_errno == ENOSYS) { - /* return ENOTSUP */ - gf_log (this->name, GF_LOG_ERROR, - "subvolume does not support locking. " - "please load features/locks xlator on server"); - local->op_ret = op_ret; - int_lock->lock_op_ret = op_ret; - } - - local->op_errno = op_errno; - int_lock->lock_op_errno = op_errno; - } - - int_lock->lk_attempted_count++; - } - UNLOCK (&frame->lock); - - if ((op_ret == -1) && - (op_errno == ENOSYS)) { - afr_unlock (frame, this); - } else { - if (op_ret == 0) { - if (local->transaction.type == AFR_ENTRY_TRANSACTION || - local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { - int_lock->lockee[lockee_no].locked_nodes[child_index] |= LOCKED_YES; - int_lock->lockee[lockee_no].locked_count++; - int_lock->entrylk_lock_count++; - } else { - int_lock->locked_nodes[child_index] |= LOCKED_YES; - int_lock->lock_count++; - } - } - afr_lock_blocking (frame, this, cky + 1); - } - - return 0; -} - -static int32_t -afr_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_TRANSACTION, - AFR_LOCK_OP, NULL, op_ret, - op_errno, (long) cookie); - - afr_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata); - return 0; - -} - -static int32_t -afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, NULL, op_ret, - op_errno, (long)cookie); - - afr_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata); - return 0; -} - -static int -afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) +static void +afr_log_locks_failure(call_frame_t *frame, char *where, char *what, + int op_errno) { - afr_internal_lock_t *int_lock = NULL; - afr_inodelk_t *inodelk = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; + xlator_t *this = frame->this; + gf_lkowner_t *lk_owner = &frame->root->lk_owner; + afr_local_t *local = frame->local; + const char *fop = NULL; + char *gfid = NULL; + const char *name = NULL; - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - memcpy (inodelk->locked_nodes, int_lock->locked_nodes, - sizeof (*inodelk->locked_nodes) * priv->child_count); - inodelk->lock_count = int_lock->lock_count; - break; + fop = gf_fop_list[local->op]; + switch (local->transaction.type) { case AFR_ENTRY_RENAME_TRANSACTION: case AFR_ENTRY_TRANSACTION: - /*entrylk_count is being used in both non-blocking and blocking - * modes */ - break; - } - - return 0; - + switch (local->op) { + case GF_FOP_LINK: + gfid = uuid_utoa(local->newloc.pargfid); + name = local->newloc.name; + break; + default: + gfid = uuid_utoa(local->loc.pargfid); + name = local->loc.name; + break; + } + gf_msg(this->name, GF_LOG_WARNING, op_errno, + AFR_MSG_INTERNAL_LKS_FAILED, + "Unable to do entry %s with lk-owner:%s on %s " + "while attempting %s on {pgfid:%s, name:%s}.", + what, lkowner_utoa(lk_owner), where, fop, gfid, name); + break; + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + gfid = uuid_utoa(local->inode->gfid); + gf_msg(this->name, GF_LOG_WARNING, op_errno, + AFR_MSG_INTERNAL_LKS_FAILED, + "Unable to do inode %s with lk-owner:%s on %s " + "while attempting %s on gfid:%s.", + what, lkowner_utoa(lk_owner), where, fop, gfid); + break; + } } -static inline gf_boolean_t -afr_is_entrylk (afr_internal_lock_t *int_lock, - afr_transaction_type trans_type) +static int32_t +afr_unlock_common_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - gf_boolean_t is_entrylk = _gf_false; - - if ((int_lock->transaction_lk_type == AFR_SELFHEAL_LK) && - int_lock->selfheal_lk_type == AFR_ENTRY_SELF_HEAL_LK) { + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; + int lockee_num = 0; + int call_count = 0; + int child_index = 0; + int ret = 0; - is_entrylk = _gf_true; + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + lockee_num = (int)((long)cookie) / priv->child_count; + child_index = (int)((long)cookie) % priv->child_count; - } else if ((int_lock->transaction_lk_type == AFR_TRANSACTION_LK) && - (trans_type == AFR_ENTRY_TRANSACTION || - trans_type == AFR_ENTRY_RENAME_TRANSACTION)) { + if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { + afr_log_locks_failure(frame, priv->children[child_index]->name, + "unlock", op_errno); + } - is_entrylk = _gf_true; + int_lock->lockee[lockee_num].locked_nodes[child_index] &= LOCKED_NO; + if (local->transaction.type == AFR_DATA_TRANSACTION && op_ret != 1) + ret = afr_write_subvol_reset(frame, this); - } else { - is_entrylk = _gf_false; - } - - return is_entrylk; -} + LOCK(&frame->lock); + { + call_count = --int_lock->lk_call_count; + } + UNLOCK(&frame->lock); -static gf_boolean_t -_is_lock_wind_needed (afr_local_t *local, int child_index) -{ - if (!local->child_up[child_index]) - return _gf_false; + if (call_count == 0) { + int_lock->lock_cbk(frame, this); + } - return _gf_true; + return ret; } -int -afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) -{ - afr_internal_lock_t *int_lock = NULL; - afr_inodelk_t *inodelk = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - struct gf_flock flock = {0,}; - uint64_t ctx = 0; - int ret = 0; - int child_index = 0; - int lockee_no = 0; - gf_boolean_t is_entrylk = _gf_false; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - child_index = cookie % priv->child_count; - lockee_no = cookie / priv->child_count; - is_entrylk = afr_is_entrylk (int_lock, local->transaction.type); - - - if (!is_entrylk) { - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - flock.l_start = inodelk->flock.l_start; - flock.l_len = inodelk->flock.l_len; - flock.l_type = inodelk->flock.l_type; - } - - if (local->fd) { - ret = fd_ctx_get (local->fd, this, &ctx); - - if (ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "unable to get fd ctx for fd=%p", - local->fd); - - local->op_ret = -1; - int_lock->lock_op_ret = -1; - - afr_copy_locked_nodes (frame, this); - - afr_unlock (frame, this); - - return 0; - } - } - - if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { - if ((is_entrylk && int_lock->entrylk_lock_count == 0) || - (!is_entrylk && int_lock->lock_count == 0)) { - gf_log (this->name, GF_LOG_INFO, - "unable to lock on even one child"); - - local->op_ret = -1; - int_lock->lock_op_ret = -1; - - afr_copy_locked_nodes (frame, this); - - afr_unlock(frame, this); - - return 0; - } - } - - if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { - /* we're done locking */ - - gf_log (this->name, GF_LOG_DEBUG, - "we're done locking"); - - afr_copy_locked_nodes (frame, this); - - int_lock->lock_op_ret = 0; - int_lock->lock_cbk (frame, this); - return 0; - } - - if (!_is_lock_wind_needed (local, child_index)) { - afr_lock_blocking (frame, this, cookie + 1); - return 0; - } - - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - - if (local->fd) { - AFR_TRACE_INODELK_IN (frame, this, - AFR_INODELK_TRANSACTION, - AFR_LOCK_OP, &flock, F_SETLKW, - child_index); - - STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->finodelk, - int_lock->domain, local->fd, - F_SETLKW, &flock, NULL); - - } else { - AFR_TRACE_INODELK_IN (frame, this, - AFR_INODELK_TRANSACTION, - AFR_LOCK_OP, &flock, F_SETLKW, - child_index); - - STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->inodelk, - int_lock->domain, &local->loc, - F_SETLKW, &flock, NULL); - } - - break; - - case AFR_ENTRY_RENAME_TRANSACTION: +void +afr_internal_lock_wind(call_frame_t *frame, + int32_t (*cbk)(call_frame_t *, void *, xlator_t *, + int32_t, int32_t, dict_t *), + void *cookie, int child, int lockee_num, + gf_boolean_t blocking, gf_boolean_t unlock) +{ + afr_local_t *local = frame->local; + xlator_t *this = frame->this; + afr_private_t *priv = this->private; + afr_internal_lock_t *int_lock = &local->internal_lock; + entrylk_cmd cmd = ENTRYLK_LOCK_NB; + int32_t cmd1 = F_SETLK; + struct gf_flock flock = { + 0, + }; + + switch (local->transaction.type) { case AFR_ENTRY_TRANSACTION: - /*Accounting for child_index increments on 'down' - *and 'fd-less' children */ - - if (local->fd) { - AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, - int_lock->lockee[lockee_no].basename, - cookie); - - STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, - (void *) (long) cookie, - priv->children[child_index], - priv->children[child_index]->fops->fentrylk, - int_lock->domain, local->fd, - int_lock->lockee[lockee_no].basename, - ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); - } else { - AFR_TRACE_ENTRYLK_IN (frame, this, - AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, local->transaction.basename, - child_index); - - STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, - (void *) (long) cookie, - priv->children[child_index], - priv->children[child_index]->fops->entrylk, - int_lock->domain, - &int_lock->lockee[lockee_no].loc, - int_lock->lockee[lockee_no].basename, - ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); - } - - break; - } - - return 0; -} - -int32_t -afr_blocking_lock (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int up_count = 0; - - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; + case AFR_ENTRY_RENAME_TRANSACTION: + if (unlock) { + cmd = ENTRYLK_UNLOCK; + } else if (blocking) { /*Doesn't make sense to have blocking + unlock*/ + cmd = ENTRYLK_LOCK; + } + + if (local->fd) { + STACK_WIND_COOKIE(frame, cbk, cookie, priv->children[child], + priv->children[child]->fops->fentrylk, + int_lock->domain, + int_lock->lockee[lockee_num].fd, + int_lock->lockee[lockee_num].basename, cmd, + ENTRYLK_WRLCK, NULL); + } else { + STACK_WIND_COOKIE(frame, cbk, cookie, priv->children[child], + priv->children[child]->fops->entrylk, + int_lock->domain, + &int_lock->lockee[lockee_num].loc, + int_lock->lockee[lockee_num].basename, cmd, + ENTRYLK_WRLCK, NULL); + } + break; - switch (local->transaction.type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: - initialize_inodelk_variables (frame, this); - break; - - case AFR_ENTRY_RENAME_TRANSACTION: - case AFR_ENTRY_TRANSACTION: - up_count = afr_up_children_count (local->child_up, - priv->child_count); - int_lock->lk_call_count = int_lock->lk_expected_count - = (int_lock->lockee_count * - up_count); - initialize_entrylk_variables (frame, this); - break; - } - - afr_lock_blocking (frame, this, 0); - - return 0; + flock = int_lock->lockee[lockee_num].flock; + if (unlock) { + flock.l_type = F_UNLCK; + } else if (blocking) { /*Doesn't make sense to have blocking + unlock*/ + cmd1 = F_SETLKW; + } + + if (local->fd) { + STACK_WIND_COOKIE( + frame, cbk, cookie, priv->children[child], + priv->children[child]->fops->finodelk, int_lock->domain, + int_lock->lockee[lockee_num].fd, cmd1, &flock, NULL); + } else { + STACK_WIND_COOKIE( + frame, cbk, cookie, priv->children[child], + priv->children[child]->fops->inodelk, int_lock->domain, + &int_lock->lockee[lockee_num].loc, cmd1, &flock, NULL); + } + break; + } } -static int32_t -afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - int call_count = 0; - int child_index = (long) cookie; - int copies = 0; - int index = 0; - int lockee_no = 0; - afr_private_t *priv = NULL; - - priv = this->private; - - copies = priv->child_count; - index = child_index % copies; - lockee_no = child_index / copies; - - local = frame->local; - int_lock = &local->internal_lock; - - AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, - int_lock->lockee[lockee_no].basename, op_ret, - op_errno, (long) cookie); - - LOCK (&frame->lock); - { - if (op_ret < 0 ) { - if (op_errno == ENOSYS) { - /* return ENOTSUP */ - gf_log (this->name, GF_LOG_ERROR, - "subvolume does not support locking. " - "please load features/locks xlator on server"); - local->op_ret = op_ret; - int_lock->lock_op_ret = op_ret; - - int_lock->lock_op_errno = op_errno; - local->op_errno = op_errno; - } - } else if (op_ret == 0) { - int_lock->lockee[lockee_no].locked_nodes[index] |= \ - LOCKED_YES; - int_lock->lockee[lockee_no].locked_count++; - int_lock->entrylk_lock_count++; - } - - call_count = --int_lock->lk_call_count; - } - UNLOCK (&frame->lock); - - if (call_count == 0) { - gf_log (this->name, GF_LOG_TRACE, - "Last locking reply received"); - /* all locks successful. Proceed to call FOP */ - if (int_lock->entrylk_lock_count == - int_lock->lk_expected_count) { - gf_log (this->name, GF_LOG_TRACE, - "All servers locked. Calling the cbk"); - int_lock->lock_op_ret = 0; - int_lock->lock_cbk (frame, this); - } - /* Not all locks were successful. Unlock and try locking - again, this time with serially blocking locks */ - else { - gf_log (this->name, GF_LOG_TRACE, - "%d servers locked. Trying again with blocking calls", - int_lock->lock_count); - - afr_unlock(frame, this); - } +static int +afr_unlock_now(call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = 0; + int lockee_num = 0; + int i = -1; + + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + + call_count = afr_lockee_locked_nodes_count(int_lock); + + int_lock->lk_call_count = call_count; + + if (!call_count) { + gf_msg_trace(this->name, 0, "No internal locks unlocked"); + int_lock->lock_cbk(frame, this); + goto out; + } + + for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) { + lockee_num = i / priv->child_count; + child_index = i % priv->child_count; + if (int_lock->lockee[lockee_num].locked_nodes[child_index] & + LOCKED_YES) { + afr_internal_lock_wind(frame, afr_unlock_common_cbk, + (void *)(long)i, child_index, lockee_num, + _gf_false, _gf_true); + if (!--call_count) + break; } + } - return 0; -} - -int -afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - int copies = 0; - int index = 0; - int lockee_no = 0; - int32_t call_count = 0; - int i = 0; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - copies = priv->child_count; - initialize_entrylk_variables (frame, this); - - if (local->fd) { - fd_ctx = afr_fd_ctx_get (local->fd, this); - if (!fd_ctx) { - gf_log (this->name, GF_LOG_INFO, - "unable to get fd ctx for fd=%p", - local->fd); - - local->op_ret = -1; - int_lock->lock_op_ret = -1; - local->op_errno = EINVAL; - int_lock->lock_op_errno = EINVAL; - - afr_unlock (frame, this); - return -1; - } - - call_count = int_lock->lockee_count * internal_lock_count (frame, this); - int_lock->lk_call_count = call_count; - int_lock->lk_expected_count = call_count; - - if (!call_count) { - gf_log (this->name, GF_LOG_INFO, - "fd not open on any subvolumes. aborting."); - afr_unlock (frame, this); - goto out; - } - - /* Send non-blocking entrylk calls only on up children - and where the fd has been opened */ - for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) { - index = i%copies; - lockee_no = i/copies; - if (local->child_up[index]) { - AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, - AFR_LOCK_OP, - int_lock->lockee[lockee_no].basename, - i); - - STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, - (void *) (long) i, - priv->children[index], - priv->children[index]->fops->fentrylk, - this->name, local->fd, - int_lock->lockee[lockee_no].basename, - ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, - NULL); - if (!--call_count) - break; - } - } - } else { - call_count = int_lock->lockee_count * internal_lock_count (frame, this); - int_lock->lk_call_count = call_count; - int_lock->lk_expected_count = call_count; - - for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) { - index = i%copies; - lockee_no = i/copies; - if (local->child_up[index]) { - AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, - AFR_LOCK_OP, - int_lock->lockee[lockee_no].basename, - i); - - STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, - (void *) (long) i, - priv->children[index], - priv->children[index]->fops->entrylk, - this->name, &int_lock->lockee[lockee_no].loc, - int_lock->lockee[lockee_no].basename, - ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, - NULL); - - if (!--call_count) - break; - } - } - } out: - return 0; + return 0; } -int32_t -afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_internal_lock_t *int_lock = NULL; - afr_inodelk_t *inodelk = NULL; - afr_local_t *local = NULL; - int call_count = 0; - int child_index = (long) cookie; - afr_fd_ctx_t *fd_ctx = NULL; - - - local = frame->local; - int_lock = &local->internal_lock; - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - - AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_NB_TRANSACTION, - AFR_LOCK_OP, NULL, op_ret, - op_errno, (long) cookie); - - if (local->fd) - fd_ctx = afr_fd_ctx_get (local->fd, this); - - LOCK (&frame->lock); - { - if (op_ret < 0) { - if (op_errno == ENOSYS) { - /* return ENOTSUP */ - gf_log (this->name, GF_LOG_ERROR, - "subvolume does not support locking. " - "please load features/locks xlator on " - "server"); - local->op_ret = op_ret; - int_lock->lock_op_ret = op_ret; - int_lock->lock_op_errno = op_errno; - local->op_errno = op_errno; - } - if (local->transaction.eager_lock) - local->transaction.eager_lock[child_index] = 0; - } else { - inodelk->locked_nodes[child_index] |= LOCKED_YES; - inodelk->lock_count++; - - if (local->transaction.eager_lock && - local->transaction.eager_lock[child_index] && - local->fd) { - /* piggybacked */ - if (op_ret == 1) { - /* piggybacked */ - } else if (op_ret == 0) { - /* lock acquired from server */ - fd_ctx->lock_acquired[child_index]++; - } - } - } - - call_count = --int_lock->lk_call_count; - } - UNLOCK (&frame->lock); - - if (call_count == 0) { - gf_log (this->name, GF_LOG_TRACE, - "Last inode locking reply received"); - /* all locks successful. Proceed to call FOP */ - if (inodelk->lock_count == int_lock->lk_expected_count) { - gf_log (this->name, GF_LOG_TRACE, - "All servers locked. Calling the cbk"); - int_lock->lock_op_ret = 0; - int_lock->lock_cbk (frame, this); - } - /* Not all locks were successful. Unlock and try locking - again, this time with serially blocking locks */ - else { - gf_log (this->name, GF_LOG_TRACE, - "%d servers locked. Trying again with blocking calls", - int_lock->lock_count); - - afr_unlock(frame, this); +static int32_t +afr_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int cky = (long)cookie; + int child_index = 0; + int lockee_num = 0; + + priv = this->private; + local = frame->local; + int_lock = &local->internal_lock; + + child_index = ((int)cky) % priv->child_count; + lockee_num = ((int)cky) / priv->child_count; + + LOCK(&frame->lock); + { + if (op_ret == -1) { + if (op_errno == ENOSYS) { + /* return ENOTSUP */ + gf_msg(this->name, GF_LOG_ERROR, ENOSYS, + AFR_MSG_LOCK_XLATOR_NOT_LOADED, + "subvolume does not support locking. " + "please load features/locks xlator on server"); + local->op_ret = op_ret; + int_lock->lock_op_ret = op_ret; + } + + local->op_errno = op_errno; + int_lock->lock_op_errno = op_errno; + } + + int_lock->lk_attempted_count++; + } + UNLOCK(&frame->lock); + + if ((op_ret == -1) && (op_errno == ENOSYS)) { + afr_unlock_now(frame, this); + } else { + if (op_ret == 0) { + int_lock->lockee[lockee_num] + .locked_nodes[child_index] |= LOCKED_YES; + int_lock->lockee[lockee_num].locked_count++; + int_lock->lock_count++; + if (local->transaction.type == AFR_DATA_TRANSACTION) { + LOCK(&local->inode->lock); + { + local->inode_ctx->lock_count++; } + UNLOCK(&local->inode->lock); + } } + afr_lock_blocking(frame, this, cky + 1); + } - return 0; + return 0; } -int -afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) +static gf_boolean_t +_is_lock_wind_needed(afr_local_t *local, int child_index) { - afr_internal_lock_t *int_lock = NULL; - afr_inodelk_t *inodelk = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - int32_t call_count = 0; - int i = 0; - int ret = 0; - struct gf_flock flock = {0,}; - struct gf_flock full_flock = {0,}; - struct gf_flock *flock_use = NULL; - int piggyback = 0; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - - flock.l_start = inodelk->flock.l_start; - flock.l_len = inodelk->flock.l_len; - flock.l_type = inodelk->flock.l_type; - - full_flock.l_type = inodelk->flock.l_type; - - initialize_inodelk_variables (frame, this); - - if (local->fd) { - fd_ctx = afr_fd_ctx_get (local->fd, this); - if (!fd_ctx) { - gf_log (this->name, GF_LOG_INFO, - "unable to get fd ctx for fd=%p", - local->fd); - - local->op_ret = -1; - int_lock->lock_op_ret = -1; - local->op_errno = EINVAL; - int_lock->lock_op_errno = EINVAL; - - afr_unlock (frame, this); - ret = -1; - goto out; - } - - call_count = internal_lock_count (frame, this); - int_lock->lk_call_count = call_count; - int_lock->lk_expected_count = call_count; + if (!local->child_up[child_index]) + return _gf_false; - if (!call_count) { - gf_log (this->name, GF_LOG_INFO, - "fd not open on any subvolumes. aborting."); - afr_unlock (frame, this); - goto out; - } - - /* Send non-blocking inodelk calls only on up children - and where the fd has been opened */ - for (i = 0; i < priv->child_count; i++) { - if (!local->child_up[i]) - continue; - - flock_use = &flock; - if (!local->transaction.eager_lock_on) { - goto wind; - } - - piggyback = 0; - local->transaction.eager_lock[i] = 1; - - afr_set_delayed_post_op (frame, this); - - LOCK (&local->fd->lock); - { - if (fd_ctx->lock_acquired[i]) { - fd_ctx->lock_piggyback[i]++; - piggyback = 1; - } - } - UNLOCK (&local->fd->lock); - - if (piggyback) { - /* (op_ret == 1) => indicate piggybacked lock */ - afr_nonblocking_inodelk_cbk (frame, (void *) (long) i, - this, 1, 0, NULL); - if (!--call_count) - break; - continue; - } - flock_use = &full_flock; - wind: - AFR_TRACE_INODELK_IN (frame, this, - AFR_INODELK_NB_TRANSACTION, - AFR_LOCK_OP, flock_use, F_SETLK, i); - - STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->finodelk, - int_lock->domain, local->fd, - F_SETLK, flock_use, NULL); - - if (!--call_count) - break; - } - } else { - call_count = internal_lock_count (frame, this); - int_lock->lk_call_count = call_count; - int_lock->lk_expected_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (!local->child_up[i]) - continue; - AFR_TRACE_INODELK_IN (frame, this, - AFR_INODELK_NB_TRANSACTION, - AFR_LOCK_OP, &flock, F_SETLK, i); - - STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->inodelk, - int_lock->domain, &local->loc, - F_SETLK, &flock, NULL); - - if (!--call_count) - break; - } - } -out: - return ret; + return _gf_true; } -int32_t -afr_unlock (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - if (transaction_lk_op (local)) { - if (is_afr_lock_transaction (local)) - afr_unlock_inodelk (frame, this); - else - afr_unlock_entrylk (frame, this); - - } else { - if (is_afr_lock_selfheal (local)) - afr_unlock_inodelk (frame, this); - else - afr_unlock_entrylk (frame, this); +static gf_boolean_t +is_blocking_locks_count_sufficient(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; + int child = 0; + int nlockee = 0; + int lockee_count = 0; + gf_boolean_t ret = _gf_true; + + local = frame->local; + priv = this->private; + int_lock = &local->internal_lock; + lockee_count = int_lock->lockee_count; + + if (int_lock->lock_count == 0) { + afr_log_locks_failure(frame, "any subvolume", "lock", + int_lock->lock_op_errno); + return _gf_false; + } + /* For FOPS that take multiple sets of locks (mkdir, rename), + * there must be at least one brick on which the locks from + * all lock sets were successful. */ + for (child = 0; child < priv->child_count; child++) { + ret = _gf_true; + for (nlockee = 0; nlockee < lockee_count; nlockee++) { + if (!(int_lock->lockee[nlockee].locked_nodes[child] & LOCKED_YES)) + ret = _gf_false; } - - return 0; -} - -int -afr_mark_locked_nodes (xlator_t *this, fd_t *fd, - unsigned char *locked_nodes) -{ - afr_private_t *priv = NULL; - afr_fd_ctx_t *fdctx = NULL; - uint64_t tmp = 0; - int ret = 0; - - priv = this->private; - - ret = afr_fd_ctx_set (this, fd); if (ret) - goto out; - - ret = fd_ctx_get (fd, this, &tmp); - if (ret) { - gf_log (this->name, GF_LOG_INFO, - "failed to get the fd ctx"); - goto out; - } - fdctx = (afr_fd_ctx_t *) (long) tmp; - - GF_ASSERT (fdctx->locked_on); - - memcpy (fdctx->locked_on, locked_nodes, - priv->child_count); - -out: - return ret; -} - -static int -__is_fd_saved (xlator_t *this, fd_t *fd) -{ - afr_locked_fd_t *locked_fd = NULL; - afr_private_t *priv = NULL; - int found = 0; - - priv = this->private; - - list_for_each_entry (locked_fd, &priv->saved_fds, list) { - if (locked_fd->fd == fd) { - found = 1; - break; - } - } - - return found; -} - -static int -__afr_save_locked_fd (xlator_t *this, fd_t *fd) -{ - afr_private_t *priv = NULL; - afr_locked_fd_t *locked_fd = NULL; - int ret = 0; - - priv = this->private; + return ret; + } + if (!ret) + afr_log_locks_failure(frame, "all", "lock", int_lock->lock_op_errno); - locked_fd = GF_CALLOC (1, sizeof (*locked_fd), - gf_afr_mt_locked_fd); - if (!locked_fd) { - ret = -1; - goto out; - } - - locked_fd->fd = fd; - INIT_LIST_HEAD (&locked_fd->list); - - list_add_tail (&locked_fd->list, &priv->saved_fds); - -out: - return ret; + return ret; } int -afr_save_locked_fd (xlator_t *this, fd_t *fd) -{ - afr_private_t *priv = NULL; - int ret = 0; - - priv = this->private; - - pthread_mutex_lock (&priv->mutex); - { - if (__is_fd_saved (this, fd)) { - gf_log (this->name, GF_LOG_DEBUG, - "fd=%p already saved", fd); - goto unlock; - } - - ret = __afr_save_locked_fd (this, fd); - if (ret) { - gf_log (this->name, GF_LOG_INFO, - "fd=%p could not be saved", fd); - goto unlock; - } - } -unlock: - pthread_mutex_unlock (&priv->mutex); - - return ret; -} - -static int -afr_lock_recovery_cleanup (call_frame_t *frame, xlator_t *this) +afr_lock_blocking(call_frame_t *frame, xlator_t *this, int cookie) { - afr_local_t *local = NULL; - afr_locked_fd_t *locked_fd = NULL; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + uint64_t ctx = 0; + int ret = 0; + int child_index = 0; + int lockee_num = 0; - local = frame->local; + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + child_index = cookie % priv->child_count; + lockee_num = cookie / priv->child_count; - locked_fd = local->locked_fd; + if (local->fd) { + ret = fd_ctx_get(local->fd, this, &ctx); - STACK_DESTROY (frame->root); - afr_local_cleanup (local, this); - - afr_save_locked_fd (this, locked_fd->fd); - - return 0; - -} - -static int -afr_get_source_lock_recovery (xlator_t *this, fd_t *fd) -{ - afr_fd_ctx_t *fdctx = NULL; - afr_private_t *priv = NULL; - uint64_t tmp = 0; - int i = 0; - int source_child = -1; - int ret = 0; - - priv = this->private; + if (ret < 0) { + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_FD_CTX_GET_FAILED, + "unable to get fd ctx for fd=%p", local->fd); - ret = fd_ctx_get (fd, this, &tmp); - if (ret) - goto out; + local->op_ret = -1; + int_lock->lock_op_ret = -1; - fdctx = (afr_fd_ctx_t *) (long) tmp; + afr_unlock_now(frame, this); - for (i = 0; i < priv->child_count; i++) { - if (fdctx->locked_on[i]) { - gf_log (this->name, GF_LOG_DEBUG, - "Found lock recovery source=%d", i); - source_child = i; - break; - } + return 0; } + } -out: - return source_child; + if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { + if (!is_blocking_locks_count_sufficient(frame, this)) { + local->op_ret = -1; + int_lock->lock_op_ret = -1; -} + afr_unlock_now(frame, this); -int32_t -afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock, - dict_t *xdata); -int32_t -afr_recover_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock, - dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int32_t source_child = 0; - struct gf_flock flock = {0,}; - - local = frame->local; - priv = this->private; - - if (op_ret) { - gf_log (this->name, GF_LOG_INFO, - "lock recovery failed"); - goto cleanup; + return 0; } + } - source_child = local->source_child; - - memcpy (&flock, lock, sizeof (*lock)); + if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { + /* we're done locking */ - STACK_WIND_COOKIE (frame, afr_get_locks_fd_cbk, - (void *) (long) source_child, - priv->children[source_child], - priv->children[source_child]->fops->lk, - local->fd, F_GETLK_FD, &flock, NULL); - - return 0; + gf_msg_debug(this->name, 0, "we're done locking"); -cleanup: - afr_lock_recovery_cleanup (frame, this); + int_lock->lock_op_ret = 0; + int_lock->lock_cbk(frame, this); return 0; -} - -int -afr_recover_lock (call_frame_t *frame, xlator_t *this, - struct gf_flock *flock) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int32_t lock_recovery_child = 0; - - priv = this->private; - local = frame->local; - - lock_recovery_child = local->lock_recovery_child; - - frame->root->lk_owner = flock->l_owner; - - STACK_WIND_COOKIE (frame, afr_recover_lock_cbk, - (void *) (long) lock_recovery_child, - priv->children[lock_recovery_child], - priv->children[lock_recovery_child]->fops->lk, - local->fd, F_SETLK, flock, NULL); + } + if (!_is_lock_wind_needed(local, child_index)) { + afr_lock_blocking(frame, this, cookie + 1); return 0; -} - -static int -is_afr_lock_eol (struct gf_flock *lock) -{ - int ret = 0; + } - if ((lock->l_type == GF_LK_EOL)) - ret = 1; + afr_internal_lock_wind(frame, afr_lock_cbk, (void *)(long)cookie, + child_index, lockee_num, _gf_true, _gf_false); - return ret; + return 0; } int32_t -afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock, - dict_t *xdata) -{ - if (op_ret) { - gf_log (this->name, GF_LOG_INFO, - "Failed to get locks on fd"); - goto cleanup; - } - - gf_log (this->name, GF_LOG_DEBUG, - "Got a lock on fd"); - - if (is_afr_lock_eol (lock)) { - gf_log (this->name, GF_LOG_INFO, - "Reached EOL on locks on fd"); - goto cleanup; - } - - afr_recover_lock (frame, this, lock); - - return 0; - -cleanup: - afr_lock_recovery_cleanup (frame, this); - - return 0; -} - -static int -afr_lock_recovery (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - fd_t *fd = NULL; - int ret = 0; - int32_t source_child = 0; - struct gf_flock flock = {0,}; - - priv = this->private; - local = frame->local; - - fd = local->fd; - - source_child = afr_get_source_lock_recovery (this, fd); - if (source_child < 0) { - gf_log (this->name, GF_LOG_ERROR, - "Could not recover locks due to lock " - "split brain"); - ret = -1; - goto out; - } - - local->source_child = source_child; - - /* the flock can be zero filled as we're querying incrementally - the locks held on the fd. - */ - STACK_WIND_COOKIE (frame, afr_get_locks_fd_cbk, - (void *) (long) source_child, - priv->children[source_child], - priv->children[source_child]->fops->lk, - local->fd, F_GETLK_FD, &flock, NULL); - -out: - return ret; -} - - -static int -afr_mark_fd_opened (xlator_t *this, fd_t *fd, int32_t child_index) +afr_blocking_lock(call_frame_t *frame, xlator_t *this) { - afr_fd_ctx_t *fdctx = NULL; - uint64_t tmp = 0; - int ret = 0; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int up_count = 0; - ret = fd_ctx_get (fd, this, &tmp); - if (ret) - goto out; + priv = this->private; + local = frame->local; + int_lock = &local->internal_lock; - fdctx = (afr_fd_ctx_t *) (long) tmp; + up_count = AFR_COUNT(local->child_up, priv->child_count); + int_lock->lk_call_count = int_lock->lk_expected_count = + (int_lock->lockee_count * up_count); + initialize_internal_lock_variables(frame, this); - fdctx->opened_on[child_index] = AFR_FD_OPENED; + afr_lock_blocking(frame, this, 0); -out: - return ret; + return 0; } -int32_t -afr_lock_recovery_preopen_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, - dict_t *xdata) +static int32_t +afr_nb_internal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - int32_t child_index = (long )cookie; - int ret = 0; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + int call_count = 0; + int child_index = 0; + int lockee_num = 0; + afr_private_t *priv = NULL; - if (op_ret) { - gf_log (this->name, GF_LOG_INFO, - "Reopen during lock-recovery failed"); - goto cleanup; - } + priv = this->private; - gf_log (this->name, GF_LOG_DEBUG, - "Open succeeded => proceed to recover locks"); + child_index = ((long)cookie) % priv->child_count; + lockee_num = ((long)cookie) / priv->child_count; - ret = afr_lock_recovery (frame, this); - if (ret) { - gf_log (this->name, GF_LOG_INFO, - "Lock recovery failed"); - goto cleanup; - } + local = frame->local; + int_lock = &local->internal_lock; - ret = afr_mark_fd_opened (this, fd, child_index); - if (ret) { - gf_log (this->name, GF_LOG_INFO, - "Marking fd open failed"); - goto cleanup; + if (op_ret == 0 && local->transaction.type == AFR_DATA_TRANSACTION) { + LOCK(&local->inode->lock); + { + local->inode_ctx->lock_count++; } + UNLOCK(&local->inode->lock); + } - return 0; - -cleanup: - afr_lock_recovery_cleanup (frame, this); - return 0; -} - -static int -afr_lock_recovery_preopen (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - uint64_t tmp = 0; - afr_fd_ctx_t *fdctx = NULL; - loc_t loc = {0,}; - int32_t child_index = 0; - int ret = 0; - - priv = this->private; - local = frame->local; - - GF_ASSERT (local && local->fd); - - ret = fd_ctx_get (local->fd, this, &tmp); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to get the context of fd", - uuid_utoa (local->fd->inode->gfid)); - fdctx = (afr_fd_ctx_t *) (long) tmp; - /* TODO: instead we should return from the function */ - GF_ASSERT (fdctx); - - child_index = local->lock_recovery_child; - - inode_path (local->fd->inode, NULL, (char **)&loc.path); - loc.name = strrchr (loc.path, '/'); - loc.inode = inode_ref (local->fd->inode); - loc.parent = inode_parent (local->fd->inode, 0, NULL); - - - STACK_WIND_COOKIE (frame, afr_lock_recovery_preopen_cbk, - (void *)(long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->open, - &loc, fdctx->flags, local->fd, NULL); - - return 0; -} - -static int -is_fd_opened (fd_t *fd, int32_t child_index) -{ - afr_fd_ctx_t *fdctx = NULL; - uint64_t tmp = 0; - int ret = 0; - - ret = fd_ctx_get (fd, THIS, &tmp); - if (ret) - goto out; - - fdctx = (afr_fd_ctx_t *) (long) tmp; - - if (fdctx->opened_on[child_index] == AFR_FD_OPENED) - ret = 1; - -out: - return ret; + LOCK(&frame->lock); + { + if (op_ret < 0) { + if (op_errno == ENOSYS) { + /* return ENOTSUP */ + gf_msg(this->name, GF_LOG_ERROR, ENOSYS, + AFR_MSG_LOCK_XLATOR_NOT_LOADED, + "subvolume does not support " + "locking. please load features/locks" + " xlator on server"); + local->op_ret = op_ret; + int_lock->lock_op_ret = op_ret; + + int_lock->lock_op_errno = op_errno; + local->op_errno = op_errno; + } + } else if (op_ret == 0) { + int_lock->lockee[lockee_num] + .locked_nodes[child_index] |= LOCKED_YES; + int_lock->lockee[lockee_num].locked_count++; + int_lock->lock_count++; + } + + call_count = --int_lock->lk_call_count; + } + UNLOCK(&frame->lock); + + if (call_count == 0) { + gf_msg_trace(this->name, 0, "Last locking reply received"); + /* all locks successful. Proceed to call FOP */ + if (int_lock->lock_count == int_lock->lk_expected_count) { + gf_msg_trace(this->name, 0, "All servers locked. Calling the cbk"); + int_lock->lock_op_ret = 0; + int_lock->lock_cbk(frame, this); + } + /* Not all locks were successful. Unlock and try locking + again, this time with serially blocking locks */ + else { + gf_msg_trace(this->name, 0, + "%d servers locked. Trying again " + "with blocking calls", + int_lock->lock_count); + + afr_unlock_now(frame, this); + } + } + + return 0; } int -afr_attempt_lock_recovery (xlator_t *this, int32_t child_index) -{ - call_frame_t *frame = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_locked_fd_t *locked_fd = NULL; - afr_locked_fd_t *tmp = NULL; - int ret = -1; - struct list_head locks_list = {0,}; - int32_t op_errno = 0; - - - priv = this->private; - - if (list_empty (&priv->saved_fds)) - goto out; - - frame = create_frame (this, this->ctx->pool); - if (!frame) { - ret = -1; - goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) { - ret = -1; - goto out; - } - - frame->local = local; - - INIT_LIST_HEAD (&locks_list); - - pthread_mutex_lock (&priv->mutex); - { - list_splice_init (&priv->saved_fds, &locks_list); - } - pthread_mutex_unlock (&priv->mutex); - - list_for_each_entry_safe (locked_fd, tmp, - &locks_list, list) { - - list_del_init (&locked_fd->list); - - local->fd = fd_ref (locked_fd->fd); - local->lock_recovery_child = child_index; - local->locked_fd = locked_fd; - - if (!is_fd_opened (locked_fd->fd, child_index)) { - gf_log (this->name, GF_LOG_DEBUG, - "attempting open before lock " - "recovery"); - afr_lock_recovery_preopen (frame, this); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "attempting lock recovery " - "without a preopen"); - afr_lock_recovery (frame, this); - } +afr_lock_nonblocking(call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int child = 0; + int lockee_num = 0; + int32_t call_count = 0; + int i = 0; + int ret = 0; + + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + + initialize_internal_lock_variables(frame, this); + + if (local->fd) { + fd_ctx = afr_fd_ctx_get(local->fd, this); + if (!fd_ctx) { + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_FD_CTX_GET_FAILED, + "unable to get fd ctx for fd=%p", local->fd); + + local->op_ret = -1; + int_lock->lock_op_ret = -1; + local->op_errno = EINVAL; + int_lock->lock_op_errno = EINVAL; + + afr_unlock_now(frame, this); + ret = -1; + goto out; + } + } + + call_count = int_lock->lockee_count * internal_lock_count(frame, this); + int_lock->lk_call_count = call_count; + int_lock->lk_expected_count = call_count; + + if (!call_count) { + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_INFO_COMMON, + "fd not open on any subvolumes. aborting."); + afr_unlock_now(frame, this); + goto out; + } + + /* Send non-blocking lock calls only on up children + and where the fd has been opened */ + for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) { + child = i % priv->child_count; + lockee_num = i / priv->child_count; + if (local->child_up[child]) { + afr_internal_lock_wind(frame, afr_nb_internal_lock_cbk, + (void *)(long)i, child, lockee_num, + _gf_false, _gf_false); + if (!--call_count) + break; } - + } out: - if ((ret < 0) && frame) - AFR_STACK_DESTROY (frame); - return ret; + return ret; } -int -afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, - unsigned int child_count) -{ - afr_local_t *dst_local = NULL; - afr_local_t *src_local = NULL; - afr_internal_lock_t *dst_lock = NULL; - afr_internal_lock_t *src_lock = NULL; - afr_inodelk_t *dst_inodelk = NULL; - afr_inodelk_t *src_inodelk = NULL; - int ret = -1; - - src_local = src->local; - src_lock = &src_local->internal_lock; - src_inodelk = afr_get_inodelk (src_lock, dom); - dst_local = dst->local; - dst_lock = &dst_local->internal_lock; - dst_inodelk = afr_get_inodelk (dst_lock, dom); - if (!dst_inodelk || !src_inodelk) - goto out; - if (src_inodelk->locked_nodes) { - memcpy (dst_inodelk->locked_nodes, src_inodelk->locked_nodes, - sizeof (*dst_inodelk->locked_nodes) * child_count); - memset (src_inodelk->locked_nodes, 0, - sizeof (*src_inodelk->locked_nodes) * child_count); - } +int32_t +afr_unlock(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_lock_t *lock = NULL; + + local = frame->local; + + if (!local->transaction.eager_lock_on) + goto out; + lock = &local->inode_ctx->lock[local->transaction.type]; + LOCK(&local->inode->lock); + { + list_del_init(&local->transaction.owner_list); + if (list_empty(&lock->owners) && list_empty(&lock->post_op)) { + local->transaction.do_eager_unlock = _gf_true; + /*TODO: Need to get metadata use on_disk and inherit/uninherit + *GF_ASSERT (!local->inode_ctx->on_disk[local->transaction.type]); + *GF_ASSERT (!local->inode_ctx->inherited[local->transaction.type]); + */ + GF_ASSERT(lock->release); + } + } + UNLOCK(&local->inode->lock); + if (!local->transaction.do_eager_unlock) { + local->internal_lock.lock_cbk(frame, this); + return 0; + } - dst_lock->transaction_lk_type = src_lock->transaction_lk_type; - dst_lock->selfheal_lk_type = src_lock->selfheal_lk_type; - dst_inodelk->lock_count = src_inodelk->lock_count; - src_inodelk->lock_count = 0; - ret = 0; out: - return ret; + afr_unlock_now(frame, this); + return 0; } diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index e01ab366f4f..816065fb57a 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -8,41 +8,31 @@ cases as published by the Free Software Foundation. */ - #ifndef __AFR_MEM_TYPES_H__ #define __AFR_MEM_TYPES_H__ -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_afr_mem_types_ { - gf_afr_mt_iovec = gf_common_mt_end + 1, - gf_afr_mt_afr_fd_ctx_t, - gf_afr_mt_afr_private_t, - gf_afr_mt_int32_t, - gf_afr_mt_char, - gf_afr_mt_xattr_key, - gf_afr_mt_dict_t, - gf_afr_mt_xlator_t, - gf_afr_mt_iatt, - gf_afr_mt_int, - gf_afr_mt_afr_node_character, - gf_afr_mt_sh_diff_loop_state, - gf_afr_mt_uint8_t, - gf_afr_mt_loc_t, - gf_afr_mt_entry_name, - gf_afr_mt_pump_priv, - gf_afr_mt_locked_fd, - gf_afr_mt_inode_ctx_t, - gf_afr_fd_paused_call_t, - gf_afr_mt_crawl_data_t, - gf_afr_mt_brick_pos_t, - gf_afr_mt_shd_bool_t, - gf_afr_mt_shd_timer_t, - gf_afr_mt_shd_event_t, - gf_afr_mt_time_t, - gf_afr_mt_pos_data_t, - gf_afr_mt_reply_t, - gf_afr_mt_end + gf_afr_mt_afr_fd_ctx_t = gf_common_mt_end + 1, + gf_afr_mt_afr_private_t, + gf_afr_mt_int32_t, + gf_afr_mt_char, + gf_afr_mt_xattr_key, + gf_afr_mt_dict_t, + gf_afr_mt_xlator_t, + gf_afr_mt_afr_node_character, + gf_afr_mt_inode_ctx_t, + gf_afr_mt_shd_event_t, + gf_afr_mt_reply_t, + gf_afr_mt_subvol_healer_t, + gf_afr_mt_spbc_timeout_t, + gf_afr_mt_spb_status_t, + gf_afr_mt_empty_brick_t, + gf_afr_mt_child_latency_t, + gf_afr_mt_atomic_t, + gf_afr_mt_lk_heal_info_t, + gf_afr_mt_gf_lock, + gf_afr_mt_end }; #endif - diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h new file mode 100644 index 00000000000..e73fd997765 --- /dev/null +++ b/xlators/cluster/afr/src/afr-messages.h @@ -0,0 +1,167 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. + */ + +#ifndef _AFR_MESSAGES_H_ +#define _AFR_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID( + AFR, AFR_MSG_QUORUM_FAIL, AFR_MSG_QUORUM_MET, AFR_MSG_QUORUM_OVERRIDE, + AFR_MSG_INVALID_CHILD_UP, AFR_MSG_SUBVOL_UP, AFR_MSG_SUBVOLS_DOWN, + AFR_MSG_ENTRY_UNLOCK_FAIL, AFR_MSG_SPLIT_BRAIN, AFR_MSG_OPEN_FAIL, + AFR_MSG_UNLOCK_FAIL, AFR_MSG_REPLACE_BRICK_STATUS, AFR_MSG_GFID_NULL, + AFR_MSG_FD_CREATE_FAILED, AFR_MSG_DICT_SET_FAILED, + AFR_MSG_EXPUNGING_FILE_OR_DIR, AFR_MSG_MIGRATION_IN_PROGRESS, + AFR_MSG_CHILD_MISCONFIGURED, AFR_MSG_VOL_MISCONFIGURED, + AFR_MSG_INTERNAL_LKS_FAILED, AFR_MSG_INVALID_FD, AFR_MSG_LOCK_INFO, + AFR_MSG_LOCK_XLATOR_NOT_LOADED, AFR_MSG_FD_CTX_GET_FAILED, + AFR_MSG_INVALID_SUBVOL, AFR_MSG_PUMP_XLATOR_ERROR, AFR_MSG_SELF_HEAL_INFO, + AFR_MSG_READ_SUBVOL_ERROR, AFR_MSG_DICT_GET_FAILED, AFR_MSG_INFO_COMMON, + AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, AFR_MSG_LOCAL_CHILD, AFR_MSG_INVALID_DATA, + AFR_MSG_INVALID_ARG, AFR_MSG_INDEX_DIR_GET_FAILED, AFR_MSG_FSYNC_FAILED, + AFR_MSG_FAVORITE_CHILD, AFR_MSG_SELF_HEAL_FAILED, + AFR_MSG_SPLIT_BRAIN_STATUS, AFR_MSG_ADD_BRICK_STATUS, AFR_MSG_NO_CHANGELOG, + AFR_MSG_TIMER_CREATE_FAIL, AFR_MSG_SBRAIN_FAV_CHILD_POLICY, + AFR_MSG_INODE_CTX_GET_FAILED, AFR_MSG_THIN_ARB, + AFR_MSG_THIN_ARB_XATTROP_FAILED, AFR_MSG_THIN_ARB_LOC_POP_FAILED, + AFR_MSG_GET_PEND_VAL, AFR_MSG_THIN_ARB_SKIP_SHD, AFR_MSG_UNKNOWN_SET, + AFR_MSG_NO_XL_ID, AFR_MSG_SELF_HEAL_INFO_START, + AFR_MSG_SELF_HEAL_INFO_FINISH, AFR_MSG_INCRE_COUNT, + AFR_MSG_ADD_TO_OUTPUT_FAILED, AFR_MSG_SET_TIME_FAILED, + AFR_MSG_GFID_MISMATCH_DETECTED, AFR_MSG_GFID_HEAL_MSG, + AFR_MSG_THIN_ARB_LOOKUP_FAILED, AFR_MSG_DICT_CREATE_FAILED, + AFR_MSG_NO_MAJORITY_TO_RESOLVE, AFR_MSG_TYPE_MISMATCH, + AFR_MSG_SIZE_POLICY_NOT_APPLICABLE, AFR_MSG_NO_CHILD_SELECTED, + AFR_MSG_INVALID_CHILD, AFR_MSG_RESOLVE_CONFLICTING_DATA, + SERROR_GETTING_SRC_BRICK, SNO_DIFF_IN_MTIME, SNO_BIGGER_FILE, + SALL_BRICKS_UP_TO_RESOLVE, AFR_MSG_UNLOCK_FAILED, AFR_MSG_POST_OP_FAILED, + AFR_MSG_TA_FRAME_CREATE_FAILED, AFR_MSG_SET_KEY_XATTROP_FAILED, + AFR_MSG_BLOCKING_ENTRYLKS_FAILED, AFR_MSG_FOP_FAILED, + AFR_MSG_CLEAN_UP_FAILED, AFR_MSG_UNABLE_TO_FETCH, AFR_MSG_XATTR_SET_FAILED, + AFR_MSG_SPLIT_BRAIN_REPLICA, AFR_MSG_INODE_CTX_FAILED, + AFR_MSG_LOOKUP_FAILED, AFR_MSG_ALL_SUBVOLS_DOWN, + AFR_MSG_RELEASE_LOCK_FAILED, AFR_MSG_CLEAR_TIME_SPLIT_BRAIN, + AFR_MSG_READ_FAILED, AFR_MSG_LAUNCH_FAILED, AFR_MSG_READ_SUBVOL_NOT_UP, + AFR_MSG_LK_HEAL_DOM, AFR_MSG_NEW_BRICK, AFR_MSG_SPLIT_BRAIN_SET_FAILED, + AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED, AFR_MSG_HEALER_SPAWN_FAILED, + AFR_MSG_ADD_CRAWL_EVENT_FAILED, AFR_MSG_NULL_DEREF, AFR_MSG_SET_PEND_XATTR, + AFR_MSG_INTERNAL_ATTR); + +#define AFR_MSG_DICT_GET_FAILED_STR "Dict get failed" +#define AFR_MSG_DICT_SET_FAILED_STR "Dict set failed" +#define AFR_MSG_HEALER_SPAWN_FAILED_STR "Healer spawn failed" +#define AFR_MSG_ADD_CRAWL_EVENT_FAILED_STR "Adding crawl event failed" +#define AFR_MSG_INVALID_ARG_STR "Invalid argument" +#define AFR_MSG_INDEX_DIR_GET_FAILED_STR "unable to get index-dir on " +#define AFR_MSG_THIN_ARB_LOOKUP_FAILED_STR "Failed lookup on file" +#define AFR_MSG_DICT_CREATE_FAILED_STR "Failed to create dict." +#define AFR_MSG_THIN_ARB_XATTROP_FAILED_STR "Xattrop failed." +#define AFR_MSG_THIN_ARB_LOC_POP_FAILED_STR \ + "Failed to populate loc for thin-arbiter" +#define AFR_MSG_GET_PEND_VAL_STR "Error getting value of pending" +#define AFR_MSG_THIN_ARB_SKIP_SHD_STR "I am not the god shd. skipping." +#define AFR_MSG_UNKNOWN_SET_STR "Unknown set" +#define AFR_MSG_NO_XL_ID_STR "xl does not have id" +#define AFR_MSG_SELF_HEAL_INFO_START_STR "starting full sweep on" +#define AFR_MSG_SELF_HEAL_INFO_FINISH_STR "finished full sweep on" +#define AFR_MSG_INCRE_COUNT_STR "Could not increment the counter." +#define AFR_MSG_ADD_TO_OUTPUT_FAILED_STR "Could not add to output" +#define AFR_MSG_SET_TIME_FAILED_STR "Could not set time" +#define AFR_MSG_GFID_HEAL_MSG_STR "Error setting gfid-heal-msg dict" +#define AFR_MSG_NO_MAJORITY_TO_RESOLVE_STR \ + "No majority to resolve gfid split brain" +#define AFR_MSG_GFID_MISMATCH_DETECTED_STR "Gfid mismatch dectected" +#define AFR_MSG_SELF_HEAL_INFO_STR "performing selfheal" +#define AFR_MSG_TYPE_MISMATCH_STR "TYPE mismatch" +#define AFR_MSG_SIZE_POLICY_NOT_APPLICABLE_STR \ + "Size policy is not applicable to directories." +#define AFR_MSG_NO_CHILD_SELECTED_STR \ + "No child selected by favorite-child policy" +#define AFR_MSG_INVALID_CHILD_STR "Invalid child" +#define AFR_MSG_RESOLVE_CONFLICTING_DATA_STR \ + "selected as authentic to resolve conflicting data" +#define SERROR_GETTING_SRC_BRICK_STR "Error getting the source brick" +#define SNO_DIFF_IN_MTIME_STR "No difference in mtime" +#define SNO_BIGGER_FILE_STR "No bigger file" +#define SALL_BRICKS_UP_TO_RESOLVE_STR \ + "All the bricks should be up to resolve the gfid split brain" +#define AFR_MSG_UNLOCK_FAILED_STR "Failed to unlock" +#define AFR_MSG_POST_OP_FAILED_STR "Post-op on thin-arbiter failed" +#define AFR_MSG_TA_FRAME_CREATE_FAILED_STR "Failed to create ta_frame" +#define AFR_MSG_SET_KEY_XATTROP_FAILED_STR "Could not set key during xattrop" +#define AFR_MSG_BLOCKING_ENTRYLKS_FAILED_STR "Blocking entrylks failed" +#define AFR_MSG_FSYNC_FAILED_STR "fsync failed" +#define AFR_MSG_QUORUM_FAIL_STR "quorum is not met" +#define AFR_MSG_FOP_FAILED_STR "Failing Fop" +#define AFR_MSG_INVALID_SUBVOL_STR "not a subvolume" +#define AFR_MSG_VOL_MISCONFIGURED_STR "Volume is dangling" +#define AFR_MSG_CHILD_MISCONFIGURED_STR \ + "replicate translator needs more than one subvolume defined" +#define AFR_MSG_CLEAN_UP_FAILED_STR "Failed to clean up healer threads" +#define AFR_MSG_QUORUM_OVERRIDE_STR "overriding quorum-count" +#define AFR_MSG_UNABLE_TO_FETCH_STR \ + "Unable to fetch afr-pending-xattr option from volfile. Falling back to " \ + "using client translator names" +#define AFR_MSG_NULL_DEREF_STR "possible NULL deref" +#define AFR_MSG_XATTR_SET_FAILED_STR "Cannot set xattr cookie key" +#define AFR_MSG_SPLIT_BRAIN_STATUS_STR "Failed to create synctask" +#define AFR_MSG_SUBVOLS_DOWN_STR "All subvolumes are not up" +#define AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR_STR \ + "Failed to cancel split-brain choice" +#define AFR_MSG_SPLIT_BRAIN_REPLICA_STR \ + "Cannot set replica. File is not in data/metadata split-brain" +#define AFR_MSG_INODE_CTX_FAILED_STR "Failed to get inode_ctx" +#define AFR_MSG_READ_SUBVOL_ERROR_STR "no read subvols" +#define AFR_MSG_LOCAL_CHILD_STR "selecting local read-child" +#define AFR_MSG_LOOKUP_FAILED_STR "Failed to lookup/create thin-arbiter id file" +#define AFR_MSG_TIMER_CREATE_FAIL_STR \ + "Cannot create timer for delayed initialization" +#define AFR_MSG_SUBVOL_UP_STR "Subvolume came back up; going online" +#define AFR_MSG_ALL_SUBVOLS_DOWN_STR \ + "All subvolumes are down. Going offline until atleast one of them is up" +#define AFR_MSG_RELEASE_LOCK_FAILED_STR "Failed to release lock" +#define AFR_MSG_INVALID_CHILD_UP_STR "Received child_up from invalid subvolume" +#define AFR_MSG_QUORUM_MET_STR "Client-quorum is met" +#define AFR_MSG_EXPUNGING_FILE_OR_DIR_STR "expunging file or dir" +#define AFR_MSG_SELF_HEAL_FAILED_STR "Invalid" +#define AFR_MSG_SPLIT_BRAIN_STR "Skipping conservative mergeon the file" +#define AFR_MSG_CLEAR_TIME_SPLIT_BRAIN_STR "clear time split brain" +#define AFR_MSG_READ_FAILED_STR "Failing read since good brick is down" +#define AFR_MSG_LAUNCH_FAILED_STR "Failed to launch synctask" +#define AFR_MSG_READ_SUBVOL_NOT_UP_STR \ + "read subvolume in this generation is not up" +#define AFR_MSG_INTERNAL_LKS_FAILED_STR \ + "Unable to work with lk-owner while attempting fop" +#define AFR_MSG_LOCK_XLATOR_NOT_LOADED_STR \ + "subvolume does not support locking. please load features/locks xlator " \ + "on server." +#define AFR_MSG_FD_CTX_GET_FAILED_STR "unable to get fd ctx" +#define AFR_MSG_INFO_COMMON_STR "fd not open on any subvolumes, aborting." +#define AFR_MSG_REPLACE_BRICK_STATUS_STR "Couldn't acquire lock on any child." +#define AFR_MSG_NEW_BRICK_STR "New brick" +#define AFR_MSG_SPLIT_BRAIN_SET_FAILED_STR \ + "Failed to set split-brain choice to -1" +#define AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED_STR \ + "Failed to determine split-brain. Aborting split-brain-choice set" +#define AFR_MSG_OPEN_FAIL_STR "Failed to open subvolume" +#define AFR_MSG_SET_PEND_XATTR_STR "Set of pending xattr" +#define AFR_MSG_INTERNAL_ATTR_STR "is an internal extended attribute" +#endif /* !_AFR_MESSAGES_H_ */ diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index 643a5d692df..64856042b65 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ b/xlators/cluster/afr/src/afr-open.c @@ -8,375 +8,346 @@ cases as published by the Free Software Foundation. */ -#include <libgen.h> #include <unistd.h> -#include <fnmatch.h> #include <sys/time.h> #include <stdlib.h> #include <signal.h> -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" +#include <glusterfs/glusterfs.h> #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" -#include "statedump.h" - -#include "fd.h" - -#include "afr-inode-read.h" -#include "afr-inode-write.h" -#include "afr-dir-read.h" -#include "afr-dir-write.h" -#include "afr-transaction.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" - -int -afr_stale_child_up (afr_local_t *local, xlator_t *this) -{ - int i = 0; - afr_private_t *priv = NULL; - int up = -1; - - priv = this->private; - - if (!local->fresh_children) - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) - goto out; - - afr_inode_get_read_ctx (this, local->fd->inode, local->fresh_children); - if (priv->child_count == afr_get_children_count (local->fresh_children, - priv->child_count)) - goto out; +#include <glusterfs/dict.h> +#include <glusterfs/logging.h> +#include <glusterfs/defaults.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/compat.h> +#include <glusterfs/byte-order.h> +#include <glusterfs/statedump.h> - for (i = 0; i < priv->child_count; i++) { - if (!local->child_up[i]) - continue; - if (afr_is_child_present (local->fresh_children, - priv->child_count, i)) - continue; - up = i; - break; - } -out: - return up; -} +#include "afr-transaction.h" -void -afr_perform_data_self_heal (call_frame_t *frame, xlator_t *this) +gf_boolean_t +afr_is_fd_fixable(fd_t *fd) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - inode_t *inode = NULL; - int st_child = -1; - char reason[64] = {0}; - - local = frame->local; - sh = &local->self_heal; - inode = local->fd->inode; - - if (!IA_ISREG (inode->ia_type)) - goto out; - - st_child = afr_stale_child_up (local, this); - if (st_child < 0) - goto out; - - sh->do_data_self_heal = _gf_true; - sh->do_metadata_self_heal = _gf_true; - sh->do_gfid_self_heal = _gf_true; - sh->do_missing_entry_self_heal = _gf_true; - - snprintf (reason, sizeof (reason), "stale subvolume %d detected", - st_child); - afr_launch_self_heal (frame, this, inode, _gf_true, inode->ia_type, - reason, NULL, NULL); -out: - return; + if (!fd || !fd->inode) + return _gf_false; + else if (fd_is_anonymous(fd)) + return _gf_false; + else if (gf_uuid_is_null(fd->inode->gfid)) + return _gf_false; + + return _gf_true; } int -afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +afr_open_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = frame->local; - afr_private_t *priv = NULL; - - priv = this->private; - if (afr_open_only_data_self_heal (priv->data_self_heal)) - afr_perform_data_self_heal (frame, this); - AFR_STACK_UNWIND (open, frame, local->op_ret, local->op_errno, - local->fd, xdata); - return 0; -} + afr_local_t *local = frame->local; + AFR_STACK_UNWIND(open, frame, local->op_ret, local->op_errno, + local->cont.open.fd, xdata); + return 0; +} int -afr_open_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - fd_t *fd, dict_t *xdata) +afr_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, fd_t *fd, dict_t *xdata) { - afr_local_t * local = NULL; - int ret = 0; - int call_count = -1; - int child_index = (long) cookie; - afr_private_t *priv = NULL; - - priv = this->private; - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - } - - if (op_ret >= 0) { - local->op_ret = op_ret; - local->success_count++; - - ret = afr_child_fd_ctx_set (this, fd, child_index, - local->cont.open.flags); - if (ret) { - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } - } + afr_local_t *local = NULL; + int call_count = -1; + int child_index = (long)cookie; + afr_fd_ctx_t *fd_ctx = NULL; + + local = frame->local; + fd_ctx = local->fd_ctx; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + + LOCK(&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } else { + local->op_ret = op_ret; + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref(xdata); } -unlock: - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if ((local->cont.open.flags & O_TRUNC) - && (local->op_ret >= 0)) { - STACK_WIND (frame, afr_open_ftruncate_cbk, - this, this->fops->ftruncate, - fd, 0, NULL); - } else { - if (afr_open_only_data_self_heal (priv->data_self_heal)) - afr_perform_data_self_heal (frame, this); - AFR_STACK_UNWIND (open, frame, local->op_ret, - local->op_errno, local->fd, xdata); - } + call_count = --local->call_count; + } + UNLOCK(&frame->lock); + + if (call_count == 0) { + afr_handle_replies_quorum(frame, this); + if (local->op_ret == -1) { + AFR_STACK_UNWIND(open, frame, local->op_ret, local->op_errno, NULL, + NULL); + } else if (fd_ctx->flags & O_TRUNC) { + STACK_WIND(frame, afr_open_ftruncate_cbk, this, + this->fops->ftruncate, fd, 0, NULL); + } else { + AFR_STACK_UNWIND(open, frame, local->op_ret, local->op_errno, + local->cont.open.fd, local->xdata_rsp); } + } - return 0; + return 0; } int -afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, dict_t *xdata) +afr_open_continue(call_frame_t *frame, xlator_t *this, int err) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - int i = 0; - int ret = -1; - int32_t call_count = 0; - int32_t op_errno = 0; - int32_t wind_flags = flags & (~O_TRUNC); - //We can't let truncation to happen outside transaction. - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); - - priv = this->private; - - if (flags & (O_CREAT|O_TRUNC)) { - QUORUM_CHECK(open,out); - } + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; - if (afr_is_split_brain (this, loc->inode)) { - /* self-heal failed */ - gf_log (this->name, GF_LOG_WARNING, - "failed to open as split brain seen, returning EIO"); - op_errno = EIO; - goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - call_count = local->call_count; - loc_copy (&local->loc, loc); + local = frame->local; + priv = this->private; - local->cont.open.flags = flags; - - local->fd = fd_ref (fd); + if (err) { + AFR_STACK_UNWIND(open, frame, -1, err, NULL, NULL); + } else { + local->call_count = AFR_COUNT(local->child_up, priv->child_count); + call_count = local->call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->open, - loc, wind_flags, fd, xdata); - - if (!--call_count) - break; - } + if (local->child_up[i]) { + STACK_WIND_COOKIE(frame, afr_open_cbk, (void *)(long)i, + priv->children[i], + priv->children[i]->fops->open, &local->loc, + (local->cont.open.flags & ~O_TRUNC), + local->cont.open.fd, local->xdata_req); + if (!--call_count) + break; + } } + } + return 0; +} - ret = 0; +int +afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int spb_subvol = 0; + int event_generation = 0; + int ret = 0; + int32_t op_errno = 0; + afr_fd_ctx_t *fd_ctx = NULL; + + // We can't let truncation to happen outside transaction. + + priv = this->private; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->op = GF_FOP_OPEN; + fd_ctx = afr_fd_ctx_get(fd, this); + if (!fd_ctx) { + op_errno = ENOMEM; + goto out; + } + + if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) { + op_errno = afr_quorum_errno(priv); + goto out; + } + + if (!afr_is_consistent_io_possible(local, priv, &op_errno)) + goto out; + + local->inode = inode_ref(loc->inode); + loc_copy(&local->loc, loc); + local->fd_ctx = fd_ctx; + fd_ctx->flags = flags; + if (xdata) + local->xdata_req = dict_ref(xdata); + + local->cont.open.flags = flags; + local->cont.open.fd = fd_ref(fd); + + ret = afr_inode_get_readable(frame, local->inode, this, NULL, + &event_generation, AFR_DATA_TRANSACTION); + if ((ret < 0) && + (afr_split_brain_read_subvol_get(local->inode, this, NULL, + &spb_subvol) == 0) && + spb_subvol < 0) { + afr_inode_refresh(frame, this, local->inode, local->inode->gfid, + afr_open_continue); + } else { + afr_open_continue(frame, this, 0); + } + + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (open, frame, -1, op_errno, fd, xdata); + AFR_STACK_UNWIND(open, frame, -1, op_errno, fd, NULL); - return 0; + return 0; } int -afr_openfd_fix_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, - dict_t *xdata) +afr_openfd_fix_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, + dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - int call_count = 0; - int child_index = (long) cookie; - - priv = this->private; - local = frame->local; - + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int call_count = 0; + int child_index = (long)cookie; + + priv = this->private; + local = frame->local; + + if (op_ret >= 0) { + gf_msg_debug(this->name, 0, + "fd for %s opened " + "successfully on subvolume %s", + local->loc.path, priv->children[child_index]->name); + } else { + gf_smsg(this->name, fop_log_level(GF_FOP_OPEN, op_errno), op_errno, + AFR_MSG_OPEN_FAIL, "path=%s", local->loc.path, "subvolume=%s", + priv->children[child_index]->name, NULL); + } + + fd_ctx = local->fd_ctx; + + LOCK(&local->fd->lock); + { if (op_ret >= 0) { - gf_log (this->name, GF_LOG_DEBUG, "fd for %s opened " - "successfully on subvolume %s", local->loc.path, - priv->children[child_index]->name); + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; } else { - gf_log (this->name, GF_LOG_ERROR, "Failed to open %s " - "on subvolume %s", local->loc.path, - priv->children[child_index]->name); + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; } + } + UNLOCK(&local->fd->lock); - fd_ctx = afr_fd_ctx_get (local->fd, this); - if (!fd_ctx) { - gf_log (this->name, GF_LOG_WARNING, - "failed to get fd context, %p", local->fd); - goto out; - } + call_count = afr_frame_return(frame); + if (call_count == 0) + AFR_STACK_DESTROY(frame); - LOCK (&local->fd->lock); - { - if (op_ret >= 0) { - fd_ctx->opened_on[child_index] = AFR_FD_OPENED; - } else { - fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; - } - } - UNLOCK (&local->fd->lock); -out: - call_count = afr_frame_return (frame); - if (call_count == 0) - AFR_STACK_DESTROY (frame); + return 0; +} + +static int +afr_fd_ctx_need_open(fd_t *fd, xlator_t *this, unsigned char *need_open) +{ + afr_fd_ctx_t *fd_ctx = NULL; + afr_private_t *priv = NULL; + int i = 0; + int count = 0; + + priv = this->private; + fd_ctx = afr_fd_ctx_get(fd, this); + if (!fd_ctx) return 0; + + LOCK(&fd->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (fd_ctx->opened_on[i] == AFR_FD_NOT_OPENED && + priv->child_up[i]) { + fd_ctx->opened_on[i] = AFR_FD_OPENING; + need_open[i] = 1; + count++; + } else { + need_open[i] = 0; + } + } + } + UNLOCK(&fd->lock); + + return count; } void -afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open) +afr_fix_open(fd_t *fd, xlator_t *this) { - afr_private_t *priv = NULL; - int i = 0; - call_frame_t *frame = NULL; - afr_local_t *local = NULL; - int ret = -1; - int32_t op_errno = 0; - afr_fd_ctx_t *fd_ctx = NULL; - - priv = this->private; - - if (!afr_is_fd_fixable (fd) || !need_open || !need_open_count) - goto out; - - fd_ctx = afr_fd_ctx_get (fd, this); - if (!fd_ctx) { - ret = -1; - goto out; - } + afr_private_t *priv = NULL; + int i = 0; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + int ret = -1; + int32_t op_errno = 0; + afr_fd_ctx_t *fd_ctx = NULL; + unsigned char *need_open = NULL; + int call_count = 0; - frame = create_frame (this, this->ctx->pool); - if (!frame) { - ret = -1; - goto out; - } + priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + if (!afr_is_fd_fixable(fd)) + goto out; - local->loc.inode = inode_ref (fd->inode); - ret = loc_path (&local->loc, NULL); - if (ret < 0) - goto out; + fd_ctx = afr_fd_ctx_get(fd, this); + if (!fd_ctx) + goto out; - local->fd = fd_ref (fd); - local->call_count = need_open_count; + need_open = alloca0(priv->child_count); - gf_log (this->name, GF_LOG_DEBUG, "need open count: %zd", - need_open_count); + call_count = afr_fd_ctx_need_open(fd, this, need_open); + if (!call_count) + goto out; - for (i = 0; i < priv->child_count; i++) { - if (!need_open[i]) - continue; - - if (IA_IFDIR == fd->inode->ia_type) { - gf_log (this->name, GF_LOG_DEBUG, - "opening fd for dir %s on subvolume %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk, - (void*) (long) i, - priv->children[i], - priv->children[i]->fops->opendir, - &local->loc, local->fd, - NULL); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "opening fd for file %s on subvolume %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk, - (void *)(long) i, - priv->children[i], - priv->children[i]->fops->open, - &local->loc, - fd_ctx->flags & (~O_TRUNC), - local->fd, NULL); - } + frame = create_frame(this, this->ctx->pool); + if (!frame) + goto out; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->loc.inode = inode_ref(fd->inode); + ret = loc_path(&local->loc, NULL); + if (ret < 0) + goto out; + + local->fd = fd_ref(fd); + local->fd_ctx = fd_ctx; + local->call_count = call_count; + + gf_msg_debug(this->name, 0, "need open count: %d", call_count); + + for (i = 0; i < priv->child_count; i++) { + if (!need_open[i]) + continue; + + if (IA_IFDIR == fd->inode->ia_type) { + gf_msg_debug(this->name, 0, "opening fd for dir %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE(frame, afr_openfd_fix_open_cbk, (void *)(long)i, + priv->children[i], + priv->children[i]->fops->opendir, &local->loc, + local->fd, NULL); + } else { + gf_msg_debug(this->name, 0, + "opening fd for file %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE(frame, afr_openfd_fix_open_cbk, (void *)(long)i, + priv->children[i], priv->children[i]->fops->open, + &local->loc, fd_ctx->flags & (~O_TRUNC), + local->fd, NULL); } - op_errno = 0; - ret = 0; + + if (!--call_count) + break; + } + + return; out: - if (op_errno) - ret = -1; //For handling ALLOC_OR_GOTO - if (ret && frame) - AFR_STACK_DESTROY (frame); + if (frame) + AFR_STACK_DESTROY(frame); } diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c new file mode 100644 index 00000000000..6fc2c75145c --- /dev/null +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -0,0 +1,494 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "afr.h" +#include "afr-transaction.h" +#include "afr-messages.h" + +void +afr_pending_read_increment(afr_private_t *priv, int child_index) +{ + if (child_index < 0 || child_index > priv->child_count) + return; + + GF_ATOMIC_INC(priv->pending_reads[child_index]); +} + +void +afr_pending_read_decrement(afr_private_t *priv, int child_index) +{ + if (child_index < 0 || child_index > priv->child_count) + return; + + GF_ATOMIC_DEC(priv->pending_reads[child_index]); +} + +void +afr_read_txn_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + afr_pending_read_decrement(priv, local->read_subvol); + local->read_subvol = subvol; + afr_pending_read_increment(priv, subvol); + local->readfn(frame, this, subvol); +} + +int +afr_read_txn_next_subvol(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int subvol = -1; + + local = frame->local; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!local->readable[i]) { + /* don't even bother trying here. + just mark as attempted and move on. */ + local->read_attempted[i] = 1; + continue; + } + + if (!local->read_attempted[i]) { + subvol = i; + break; + } + } + + /* If no more subvols were available for reading, we leave + @subvol as -1, which is an indication we have run out of + readable subvols. */ + if (subvol != -1) + local->read_attempted[subvol] = 1; + afr_read_txn_wind(frame, this, subvol); + + return 0; +} + +static int +afr_ta_read_txn_done(int ret, call_frame_t *ta_frame, void *opaque) +{ + STACK_DESTROY(ta_frame->root); + return 0; +} + +static int +afr_ta_read_txn(void *opaque) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + int read_subvol = -1; + int query_child = AFR_CHILD_UNKNOWN; + int possible_bad_child = AFR_CHILD_UNKNOWN; + int ret = 0; + int op_errno = ENOMEM; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct gf_flock flock = { + 0, + }; + dict_t *xdata_req = NULL; + dict_t *xdata_rsp = NULL; + int **pending = NULL; + loc_t loc = { + 0, + }; + + frame = (call_frame_t *)opaque; + this = frame->this; + local = frame->local; + priv = this->private; + query_child = local->read_txn_query_child; + + if (query_child == AFR_CHILD_ZERO) { + possible_bad_child = AFR_CHILD_ONE; + } else if (query_child == AFR_CHILD_ONE) { + possible_bad_child = AFR_CHILD_ZERO; + } else { + /*read_txn_query_child is AFR_CHILD_UNKNOWN*/ + goto out; + } + + /* Ask the query_child to see if it blames the possibly bad one. */ + xdata_req = dict_new(); + if (!xdata_req) + goto out; + + pending = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS); + if (!pending) + goto out; + + ret = afr_set_pending_dict(priv, xdata_req, pending); + if (ret < 0) + goto out; + + if (local->fd) { + ret = syncop_fxattrop(priv->children[query_child], local->fd, + GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp, + NULL); + } else { + ret = syncop_xattrop(priv->children[query_child], &local->loc, + GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp, + NULL); + } + if (ret || !xdata_rsp) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed xattrop for gfid %s on %s", + uuid_utoa(local->inode->gfid), + priv->children[query_child]->name); + op_errno = -ret; + goto out; + } + + if (afr_ta_dict_contains_pending_xattr(xdata_rsp, priv, + possible_bad_child)) { + read_subvol = query_child; + goto out; + } + dict_unref(xdata_rsp); + xdata_rsp = NULL; + + /* It doesn't. So query thin-arbiter to see if it blames any data brick. */ + ret = afr_fill_ta_loc(this, &loc, _gf_true); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to populate thin-arbiter loc for: %s.", loc.name); + goto out; + } + flock.l_type = F_WRLCK; /*start and length are already zero. */ + ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], + AFR_TA_DOM_MODIFY, &loc, F_SETLKW, &flock, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "gfid:%s: Failed to get AFR_TA_DOM_MODIFY lock on %s.", + uuid_utoa(local->inode->gfid), + priv->pending_key[THIN_ARBITER_BRICK_INDEX]); + op_errno = -ret; + goto out; + } + + ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc, + GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp, + NULL); + if (ret || !xdata_rsp) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "gfid:%s: Failed xattrop on %s.", uuid_utoa(local->inode->gfid), + priv->pending_key[THIN_ARBITER_BRICK_INDEX]); + op_errno = -ret; + goto unlock; + } + + if (!afr_ta_dict_contains_pending_xattr(xdata_rsp, priv, query_child)) { + read_subvol = query_child; + } else { + gf_msg(this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB, + "Failing read for gfid %s since good brick %s is down", + uuid_utoa(local->inode->gfid), + priv->children[possible_bad_child]->name); + op_errno = EIO; + } + +unlock: + flock.l_type = F_UNLCK; + ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], + AFR_TA_DOM_MODIFY, &loc, F_SETLK, &flock, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "gfid:%s: Failed to unlock AFR_TA_DOM_MODIFY lock on " + "%s.", + uuid_utoa(local->inode->gfid), + priv->pending_key[THIN_ARBITER_BRICK_INDEX]); + } +out: + if (xdata_req) + dict_unref(xdata_req); + if (xdata_rsp) + dict_unref(xdata_rsp); + if (pending) + afr_matrix_cleanup(pending, priv->child_count); + loc_wipe(&loc); + + if (read_subvol == -1) { + local->op_ret = -1; + local->op_errno = op_errno; + } + afr_read_txn_wind(frame, this, read_subvol); + return ret; +} + +void +afr_ta_read_txn_synctask(call_frame_t *frame, xlator_t *this) +{ + call_frame_t *ta_frame = NULL; + afr_local_t *local = NULL; + int ret = 0; + + local = frame->local; + ta_frame = afr_ta_frame_create(this); + if (!ta_frame) { + local->op_ret = -1; + local->op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to create ta_frame"); + goto out; + } + ret = synctask_new(this->ctx->env, afr_ta_read_txn, afr_ta_read_txn_done, + ta_frame, frame); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to launch " + "afr_ta_read_txn synctask for gfid %s.", + uuid_utoa(local->inode->gfid)); + local->op_ret = -1; + local->op_errno = ENOMEM; + STACK_DESTROY(ta_frame->root); + goto out; + } + return; +out: + afr_read_txn_wind(frame, this, -1); +} + +int +afr_read_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int read_subvol = -1; + inode_t *inode = NULL; + int ret = -1; + int spb_subvol = -1; + + local = frame->local; + inode = local->inode; + priv = this->private; + + if (err) { + if (!priv->thin_arbiter_count) + goto readfn; + if (err != EINVAL) + goto readfn; + /* We need to query the good bricks and/or thin-arbiter.*/ + afr_ta_read_txn_synctask(frame, this); + return 0; + } + + read_subvol = afr_read_subvol_select_by_policy(inode, this, local->readable, + NULL); + if (read_subvol == -1) { + err = EIO; + goto readfn; + } + + if (local->read_attempted[read_subvol]) { + afr_read_txn_next_subvol(frame, this); + return 0; + } + + local->read_attempted[read_subvol] = 1; +readfn: + if (read_subvol == -1) { + ret = afr_split_brain_read_subvol_get(inode, this, frame, &spb_subvol); + if ((ret == 0) && spb_subvol >= 0) + read_subvol = spb_subvol; + } + + if (read_subvol == -1) { + AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN(-1, err); + } + afr_read_txn_wind(frame, this, read_subvol); + + return 0; +} + +int +afr_read_txn_continue(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + + local = frame->local; + + if (!local->refreshed) { + local->refreshed = _gf_true; + afr_inode_refresh(frame, this, local->inode, NULL, + afr_read_txn_refresh_done); + } else { + afr_read_txn_next_subvol(frame, this); + } + + return 0; +} + +/* afr_read_txn_wipe: + + clean internal variables in @local in order to make + it possible to call afr_read_txn() multiple times from + the same frame +*/ + +void +afr_read_txn_wipe(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + + local->readfn = NULL; + + if (local->inode) + inode_unref(local->inode); + + for (i = 0; i < priv->child_count; i++) { + local->read_attempted[i] = 0; + local->readable[i] = 0; + } +} + +/* + afr_read_txn: + + This is the read transaction function. The way it works: + + - Determine read-subvolume from inode ctx. + + - If read-subvolume's generation was stale, refresh ctx once by + calling afr_inode_refresh() + + Else make an attempt to read on read-subvolume. + + - If attempted read on read-subvolume fails, refresh ctx once + by calling afr_inode_refresh() + + - After ctx refresh, query read-subvolume freshly and attempt + read once. + + - If read fails, try every other readable[] subvolume before + finally giving up. readable[] elements are set by afr_inode_refresh() + based on dirty and pending flags. + + - If file is in split brain in the backend, generation will be + kept 0 by afr_inode_refresh() and readable[] will be set 0 for + all elements. Therefore reads always fail. +*/ + +int +afr_read_txn(call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_read_txn_wind_t readfn, afr_transaction_type type) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + unsigned char *data = NULL; + unsigned char *metadata = NULL; + int read_subvol = -1; + int event_generation = 0; + int ret = -1; + + priv = this->private; + local = frame->local; + data = alloca0(priv->child_count); + metadata = alloca0(priv->child_count); + + afr_read_txn_wipe(frame, this); + + local->readfn = readfn; + local->inode = inode_ref(inode); + local->is_read_txn = _gf_true; + local->transaction.type = type; + + if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) { + local->op_ret = -1; + local->op_errno = afr_quorum_errno(priv); + goto read; + } + + if (!afr_is_consistent_io_possible(local, priv, &local->op_errno)) { + local->op_ret = -1; + goto read; + } + + if (priv->thin_arbiter_count && !afr_ta_has_quorum(priv, local)) { + local->op_ret = -1; + local->op_errno = -afr_quorum_errno(priv); + goto read; + } + + if (priv->thin_arbiter_count && + AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) { + if (local->child_up[0]) { + local->read_txn_query_child = AFR_CHILD_ZERO; + } else if (local->child_up[1]) { + local->read_txn_query_child = AFR_CHILD_ONE; + } + afr_ta_read_txn_synctask(frame, this); + return 0; + } + + ret = afr_inode_read_subvol_get(inode, this, data, metadata, + &event_generation); + if (ret == -1) + /* very first transaction on this inode */ + goto refresh; + AFR_INTERSECT(local->readable, data, metadata, priv->child_count); + + gf_msg_debug(this->name, 0, + "%s: generation now vs cached: %d, " + "%d", + uuid_utoa(inode->gfid), local->event_generation, + event_generation); + if (afr_is_inode_refresh_reqd(inode, this, local->event_generation, + event_generation)) + /* servers have disconnected / reconnected, and possibly + rebooted, very likely changing the state of freshness + of copies */ + goto refresh; + + read_subvol = afr_read_subvol_select_by_policy(inode, this, local->readable, + NULL); + + if (read_subvol < 0 || read_subvol > priv->child_count) { + gf_msg_debug(this->name, 0, + "Unreadable subvolume %d found " + "with event generation %d for gfid %s.", + read_subvol, event_generation, uuid_utoa(inode->gfid)); + goto refresh; + } + + if (!local->child_up[read_subvol]) { + /* should never happen, just in case */ + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_READ_SUBVOL_ERROR, + "subvolume %d is the " + "read subvolume in this generation, but is not up", + read_subvol); + goto refresh; + } + + local->read_attempted[read_subvol] = 1; + +read: + afr_read_txn_wind(frame, this, read_subvol); + + return 0; + +refresh: + afr_inode_refresh(frame, this, inode, NULL, afr_read_txn_refresh_done); + + return 0; +} diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c deleted file mode 100644 index 83846f152d2..00000000000 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c +++ /dev/null @@ -1,837 +0,0 @@ -/* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - - -#include <openssl/md5.h> -#include "glusterfs.h" -#include "afr.h" -#include "xlator.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" - -#include "afr-transaction.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" -#include "afr-self-heal-algorithm.h" - -/* - This file contains the various self-heal algorithms -*/ - -static int -sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, - gf_boolean_t is_first_call, call_frame_t *old_loop_frame); -static int -sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame, - int32_t op_ret, int32_t op_errno); -static int -sh_destroy_frame (call_frame_t *frame, xlator_t *this) -{ - if (!frame) - goto out; - - AFR_STACK_DESTROY (frame); -out: - return 0; -} - -static void -sh_private_cleanup (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_private_t *sh_priv = NULL; - - local = frame->local; - sh = &local->self_heal; - - sh_priv = sh->private; - GF_FREE (sh_priv); -} - -static int -sh_number_of_writes_needed (unsigned char *write_needed, int child_count) -{ - int writes = 0; - int i = 0; - - for (i = 0; i < child_count; i++) { - if (write_needed[i]) - writes++; - } - - return writes; -} - - -static int -sh_loop_driver_done (call_frame_t *sh_frame, xlator_t *this, - call_frame_t *last_loop_frame) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_private_t *sh_priv = NULL; - int32_t total_blocks = 0; - int32_t diff_blocks = 0; - - local = sh_frame->local; - sh = &local->self_heal; - sh_priv = sh->private; - if (sh_priv) { - total_blocks = sh_priv->total_blocks; - diff_blocks = sh_priv->diff_blocks; - } - - sh_private_cleanup (sh_frame, this); - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - GF_ASSERT (!last_loop_frame); - //loop_finish should have happened and the old_loop should be NULL - gf_log (this->name, GF_LOG_DEBUG, - "self-heal aborting on %s", - local->loc.path); - - local->self_heal.algo_abort_cbk (sh_frame, this); - } else { - GF_ASSERT (last_loop_frame); - if (diff_blocks == total_blocks) { - gf_log (this->name, GF_LOG_DEBUG, "full self-heal " - "completed on %s",local->loc.path); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "diff self-heal on %s: completed. " - "(%d blocks of %d were different (%.2f%%))", - local->loc.path, diff_blocks, total_blocks, - ((diff_blocks * 1.0)/total_blocks) * 100); - } - - sh->old_loop_frame = last_loop_frame; - local->self_heal.algo_completion_cbk (sh_frame, this); - } - - return 0; -} - -int -sh_loop_finish (call_frame_t *loop_frame, xlator_t *this) -{ - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - - if (!loop_frame) - goto out; - - loop_local = loop_frame->local; - if (loop_local) { - loop_sh = &loop_local->self_heal; - } - - if (loop_sh && loop_sh->data_lock_held) { - afr_sh_data_unlock (loop_frame, this, this->name, - sh_destroy_frame); - } else { - sh_destroy_frame (loop_frame, this); - } -out: - return 0; -} - -static int -sh_loop_lock_success (call_frame_t *loop_frame, xlator_t *this) -{ - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - sh_loop_finish (loop_sh->old_loop_frame, this); - loop_sh->old_loop_frame = NULL; - - gf_log (this->name, GF_LOG_DEBUG, "Acquired lock for range %"PRIu64 - " %"PRIu64, loop_sh->offset, loop_sh->block_size); - loop_sh->data_lock_held = _gf_true; - loop_sh->sh_data_algo_start (loop_frame, this); - return 0; -} - -static int -sh_loop_lock_failure (call_frame_t *loop_frame, xlator_t *this) -{ - call_frame_t *sh_frame = NULL; - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - sh_frame = loop_sh->sh_frame; - - gf_log (this->name, GF_LOG_ERROR, "failed lock for range %"PRIu64 - " %"PRIu64, loop_sh->offset, loop_sh->block_size); - sh_loop_finish (loop_sh->old_loop_frame, this); - loop_sh->old_loop_frame = NULL; - sh_loop_return (sh_frame, this, loop_frame, -1, ENOTCONN); - return 0; -} - -static int -sh_loop_frame_create (call_frame_t *sh_frame, xlator_t *this, - call_frame_t *old_loop_frame, call_frame_t **loop_frame) -{ - call_frame_t *new_loop_frame = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_local_t *new_loop_local = NULL; - afr_self_heal_t *new_loop_sh = NULL; - afr_private_t *priv = NULL; - - GF_ASSERT (sh_frame); - GF_ASSERT (loop_frame); - - *loop_frame = NULL; - local = sh_frame->local; - sh = &local->self_heal; - priv = this->private; - - new_loop_frame = copy_frame (sh_frame); - if (!new_loop_frame) - goto out; - //We want the frame to have same lk_owner as sh_frame - //so that locks translator allows conflicting locks - new_loop_local = afr_self_heal_local_init (local, this); - if (!new_loop_local) - goto out; - new_loop_frame->local = new_loop_local; - - new_loop_sh = &new_loop_local->self_heal; - new_loop_sh->sources = memdup (sh->sources, - priv->child_count * sizeof (*sh->sources)); - if (!new_loop_sh->sources) - goto out; - new_loop_sh->write_needed = GF_CALLOC (priv->child_count, - sizeof (*new_loop_sh->write_needed), - gf_afr_mt_char); - if (!new_loop_sh->write_needed) - goto out; - new_loop_sh->checksum = GF_CALLOC (priv->child_count, MD5_DIGEST_LENGTH, - gf_afr_mt_uint8_t); - if (!new_loop_sh->checksum) - goto out; - new_loop_sh->inode = inode_ref (sh->inode); - new_loop_sh->sh_data_algo_start = sh->sh_data_algo_start; - new_loop_sh->source = sh->source; - new_loop_sh->active_sinks = sh->active_sinks; - new_loop_sh->healing_fd = fd_ref (sh->healing_fd); - new_loop_sh->file_has_holes = sh->file_has_holes; - new_loop_sh->old_loop_frame = old_loop_frame; - new_loop_sh->sh_frame = sh_frame; - *loop_frame = new_loop_frame; - return 0; -out: - sh_destroy_frame (new_loop_frame, this); - return -ENOMEM; -} - -static int -sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, - call_frame_t *old_loop_frame) -{ - call_frame_t *new_loop_frame = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_local_t *new_loop_local = NULL; - afr_self_heal_t *new_loop_sh = NULL; - int ret = 0; - - GF_ASSERT (sh_frame); - - local = sh_frame->local; - sh = &local->self_heal; - - ret = sh_loop_frame_create (sh_frame, this, old_loop_frame, - &new_loop_frame); - if (ret) - goto out; - new_loop_local = new_loop_frame->local; - new_loop_sh = &new_loop_local->self_heal; - new_loop_sh->offset = offset; - new_loop_sh->block_size = sh->block_size; - afr_sh_data_lock (new_loop_frame, this, offset, new_loop_sh->block_size, - _gf_true, this->name, sh_loop_lock_success, sh_loop_lock_failure); - return 0; -out: - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - if (old_loop_frame) - sh_loop_finish (old_loop_frame, this); - sh_loop_return (sh_frame, this, new_loop_frame, -1, ENOMEM); - return 0; -} - -static int -sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, - gf_boolean_t is_first_call, call_frame_t *old_loop_frame) -{ - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_private_t *sh_priv = NULL; - gf_boolean_t is_driver_done = _gf_false; - blksize_t block_size = 0; - int loop = 0; - off_t offset = 0; - afr_private_t *priv = NULL; - - priv = this->private; - local = sh_frame->local; - sh = &local->self_heal; - sh_priv = sh->private; - - LOCK (&sh_priv->lock); - { - if (!is_first_call) - sh_priv->loops_running--; - offset = sh_priv->offset; - block_size = sh->block_size; - while ((!sh->eof_reached) && - (!is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) && - (sh_priv->loops_running < priv->data_self_heal_window_size) - && (sh_priv->offset < sh->file_size)) { - - loop++; - sh_priv->offset += block_size; - sh_priv->loops_running++; - - if (!is_first_call) - break; - } - if (0 == sh_priv->loops_running) { - is_driver_done = _gf_true; - } - } - UNLOCK (&sh_priv->lock); - - if (0 == loop) { - //loop finish does unlock, but the erasing of the pending - //xattrs needs to happen before that so do not finish the loop - if (is_driver_done && - !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) - goto driver_done; - if (old_loop_frame) { - sh_loop_finish (old_loop_frame, this); - old_loop_frame = NULL; - } - } - - //If we have more loops to form we should finish previous loop after - //the next loop lock - while (loop--) { - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - // op failed in other loop, stop spawning more loops - if (old_loop_frame) { - sh_loop_finish (old_loop_frame, this); - old_loop_frame = NULL; - } - sh_loop_driver (sh_frame, this, _gf_false, NULL); - } else { - gf_log (this->name, GF_LOG_TRACE, "spawning a loop " - "for offset %"PRId64, offset); - - sh_loop_start (sh_frame, this, offset, old_loop_frame); - old_loop_frame = NULL; - offset += block_size; - } - } - -driver_done: - if (is_driver_done) { - sh_loop_driver_done (sh_frame, this, old_loop_frame); - } - return 0; -} - -static int -sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t * loop_local = NULL; - afr_self_heal_t * loop_sh = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - if (loop_frame) { - loop_local = loop_frame->local; - if (loop_local) - loop_sh = &loop_local->self_heal; - if (loop_sh) - gf_log (this->name, GF_LOG_TRACE, "loop for offset " - "%"PRId64" returned", loop_sh->offset); - } - - if (op_ret == -1) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_set_error (sh, op_errno); - if (loop_frame) { - sh_loop_finish (loop_frame, this); - loop_frame = NULL; - } - } - - sh_loop_driver (sh_frame, this, _gf_false, loop_frame); - - return 0; -} - -static int -sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - struct iatt *postbuf, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * loop_local = NULL; - afr_self_heal_t * loop_sh = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - int call_count = 0; - int child_index = 0; - - priv = this->private; - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - sh_frame = loop_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - child_index = (long) cookie; - - gf_log (this->name, GF_LOG_TRACE, - "wrote %d bytes of data from %s to child %d, offset %"PRId64"", - op_ret, sh_local->loc.path, child_index, loop_sh->offset); - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "write to %s failed on subvolume %s (%s)", - sh_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_set_error (loop_sh, op_errno); - } else if (op_ret < loop_local->cont.writev.vector->iov_len) { - gf_log (this->name, GF_LOG_ERROR, - "incomplete write to %s on subvolume %s " - "(expected %lu, returned %d)", sh_local->loc.path, - priv->children[child_index]->name, - loop_local->cont.writev.vector->iov_len, op_ret); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } - - call_count = afr_frame_return (loop_frame); - - if (call_count == 0) { - iobref_unref(loop_local->cont.writev.iobref); - - sh_loop_return (sh_frame, this, loop_frame, - loop_sh->op_ret, loop_sh->op_errno); - } - - return 0; -} - -static void -sh_prune_writes_needed (call_frame_t *sh_frame, call_frame_t *loop_frame, - afr_private_t *priv) -{ - afr_local_t *sh_local = NULL; - afr_self_heal_t *sh = NULL; - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - int i = 0; - - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - if (!strcmp (sh->algo->name, "diff")) - return; - - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - /* full self-heal guarantees there exists atleast 1 file with size 0 - * That means for other files we can preserve holes that come after - * its size before 'trim' - */ - for (i = 0; i < priv->child_count; i++) { - if (loop_sh->write_needed[i] && - ((loop_sh->offset + 1) > sh->buf[i].ia_size)) - loop_sh->write_needed[i] = 0; - } -} - -static int -sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iovec *vector, int32_t count, struct iatt *buf, - struct iobref *iobref, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * loop_local = NULL; - afr_self_heal_t * loop_sh = NULL; - call_frame_t *sh_frame = NULL; - int i = 0; - int call_count = 0; - afr_local_t * sh_local = NULL; - afr_self_heal_t * sh = NULL; - - priv = this->private; - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - sh_frame = loop_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - gf_log (this->name, GF_LOG_TRACE, - "read %d bytes of data from %s, offset %"PRId64"", - op_ret, loop_local->loc.path, loop_sh->offset); - - if (op_ret <= 0) { - if (op_ret < 0) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - gf_log (this->name, GF_LOG_ERROR, "read failed on %d " - "for %s reason :%s", sh->source, - sh_local->loc.path, strerror (errno)); - } else { - sh->eof_reached = _gf_true; - gf_log (this->name, GF_LOG_DEBUG, "Eof reached for %s", - sh_local->loc.path); - } - sh_loop_return (sh_frame, this, loop_frame, op_ret, op_errno); - goto out; - } - - if (loop_sh->file_has_holes && iov_0filled (vector, count) == 0) - sh_prune_writes_needed (sh_frame, loop_frame, priv); - - call_count = sh_number_of_writes_needed (loop_sh->write_needed, - priv->child_count); - if (call_count == 0) { - sh_loop_return (sh_frame, this, loop_frame, 0, 0); - goto out; - } - - loop_local->call_count = call_count; - - /* - * We only really need the request size at the moment, but the buffer - * is required if we want to issue a retry in the event of a short write. - * Therefore, we duplicate the vector and ref the iobref here... - */ - loop_local->cont.writev.vector = iov_dup(vector, count); - loop_local->cont.writev.iobref = iobref_ref(iobref); - - for (i = 0; i < priv->child_count; i++) { - if (!loop_sh->write_needed[i]) - continue; - STACK_WIND_COOKIE (loop_frame, sh_loop_write_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->writev, - loop_sh->healing_fd, vector, count, - loop_sh->offset, 0, iobref, NULL); - - if (!--call_count) - break; - } - -out: - return 0; -} - - -static int -sh_loop_read (call_frame_t *loop_frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - - priv = this->private; - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - STACK_WIND_COOKIE (loop_frame, sh_loop_read_cbk, - (void *) (long) loop_sh->source, - priv->children[loop_sh->source], - priv->children[loop_sh->source]->fops->readv, - loop_sh->healing_fd, loop_sh->block_size, - loop_sh->offset, 0, NULL); - - return 0; -} - - -static int -sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - uint32_t weak_checksum, uint8_t *strong_checksum, - dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t *sh_local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_private_t *sh_priv = NULL; - int child_index = 0; - int call_count = 0; - int i = 0; - int write_needed = 0; - - priv = this->private; - - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - sh_frame = loop_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - sh_priv = sh->private; - - child_index = (long) cookie; - - if (op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "checksum on %s failed on subvolume %s (%s)", - sh_local->loc.path, priv->children[child_index]->name, - strerror (op_errno)); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } else { - memcpy (loop_sh->checksum + child_index * MD5_DIGEST_LENGTH, - strong_checksum, MD5_DIGEST_LENGTH); - } - - call_count = afr_frame_return (loop_frame); - - if (call_count == 0) { - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] || !sh_local->child_up[i]) - continue; - - if (memcmp (loop_sh->checksum + (i * MD5_DIGEST_LENGTH), - loop_sh->checksum + (sh->source * MD5_DIGEST_LENGTH), - MD5_DIGEST_LENGTH)) { - /* - Checksums differ, so this block - must be written to this sink - */ - - gf_log (this->name, GF_LOG_DEBUG, - "checksum on subvolume %s at offset %" - PRId64" differs from that on source", - priv->children[i]->name, loop_sh->offset); - - write_needed = loop_sh->write_needed[i] = 1; - } - } - - LOCK (&sh_priv->lock); - { - sh_priv->total_blocks++; - if (write_needed) - sh_priv->diff_blocks++; - } - UNLOCK (&sh_priv->lock); - - if (write_needed && - !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - sh_loop_read (loop_frame, this); - } else { - sh_loop_return (sh_frame, this, loop_frame, - op_ret, op_errno); - } - } - - return 0; -} - -static int -sh_diff_checksum (call_frame_t *loop_frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - int call_count = 0; - int i = 0; - - priv = this->private; - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - call_count = loop_sh->active_sinks + 1; /* sinks and source */ - - loop_local->call_count = call_count; - - STACK_WIND_COOKIE (loop_frame, sh_diff_checksum_cbk, - (void *) (long) loop_sh->source, - priv->children[loop_sh->source], - priv->children[loop_sh->source]->fops->rchecksum, - loop_sh->healing_fd, - loop_sh->offset, loop_sh->block_size, NULL); - - for (i = 0; i < priv->child_count; i++) { - if (loop_sh->sources[i] || !loop_local->child_up[i]) - continue; - - STACK_WIND_COOKIE (loop_frame, sh_diff_checksum_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->rchecksum, - loop_sh->healing_fd, - loop_sh->offset, loop_sh->block_size, NULL); - - if (!--call_count) - break; - } - - return 0; -} - -static int -sh_full_read_write_to_sinks (call_frame_t *loop_frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - int i = 0; - - priv = this->private; - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - for (i = 0; i < priv->child_count; i++) { - if (loop_sh->sources[i] || !loop_local->child_up[i]) - continue; - loop_sh->write_needed[i] = 1; - } - sh_loop_read (loop_frame, this); - return 0; -} - -afr_sh_algo_private_t* -afr_sh_priv_init () -{ - afr_sh_algo_private_t *sh_priv = NULL; - - sh_priv = GF_CALLOC (1, sizeof (*sh_priv), - gf_afr_mt_afr_private_t); - if (!sh_priv) - goto out; - - LOCK_INIT (&sh_priv->lock); -out: - return sh_priv; -} - -int -afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src, char *dom, - unsigned int child_count) -{ - afr_local_t *dst_local = NULL; - afr_self_heal_t *dst_sh = NULL; - afr_local_t *src_local = NULL; - afr_self_heal_t *src_sh = NULL; - int ret = -1; - - dst_local = dst->local; - dst_sh = &dst_local->self_heal; - src_local = src->local; - src_sh = &src_local->self_heal; - GF_ASSERT (src_sh->data_lock_held); - GF_ASSERT (!dst_sh->data_lock_held); - ret = afr_lk_transfer_datalock (dst, src, dom, child_count); - if (ret) - return ret; - src_sh->data_lock_held = _gf_false; - dst_sh->data_lock_held = _gf_true; - return 0; -} - -int -afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this, - afr_sh_algo_fn sh_data_algo_start) -{ - call_frame_t *first_loop_frame = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int ret = 0; - afr_private_t *priv = NULL; - - local = sh_frame->local; - sh = &local->self_heal; - priv = this->private; - - sh->sh_data_algo_start = sh_data_algo_start; - local->call_count = 0; - ret = sh_loop_frame_create (sh_frame, this, NULL, &first_loop_frame); - if (ret) - goto out; - ret = afr_sh_transfer_lock (first_loop_frame, sh_frame, this->name, - priv->child_count); - if (ret) - goto out; - sh->private = afr_sh_priv_init (); - if (!sh->private) { - ret = -1; - goto out; - } - sh_loop_driver (sh_frame, this, _gf_true, first_loop_frame); - ret = 0; -out: - if (ret) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - sh_loop_driver_done (sh_frame, this, NULL); - } - return 0; -} - -int -afr_sh_algo_diff (call_frame_t *sh_frame, xlator_t *this) -{ - afr_sh_start_loops (sh_frame, this, sh_diff_checksum); - return 0; -} - -int -afr_sh_algo_full (call_frame_t *sh_frame, xlator_t *this) -{ - afr_sh_start_loops (sh_frame, this, sh_full_read_write_to_sinks); - return 0; -} - -struct afr_sh_algorithm afr_self_heal_algorithms[] = { - {.name = "full", .fn = afr_sh_algo_full}, - {.name = "diff", .fn = afr_sh_algo_diff}, - {0, 0}, -}; diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.h b/xlators/cluster/afr/src/afr-self-heal-algorithm.h deleted file mode 100644 index 6b20789b1bb..00000000000 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#ifndef __AFR_SELF_HEAL_ALGORITHM_H__ -#define __AFR_SELF_HEAL_ALGORITHM_H__ - -typedef int (*afr_sh_algo_fn) (call_frame_t *frame, - xlator_t *this); - -struct afr_sh_algorithm { - const char *name; - afr_sh_algo_fn fn; -}; - -extern struct afr_sh_algorithm afr_self_heal_algorithms[3]; -typedef struct { - gf_lock_t lock; - unsigned int loops_running; - off_t offset; - - int32_t total_blocks; - int32_t diff_blocks; -} afr_sh_algo_private_t; - -#endif /* __AFR_SELF_HEAL_ALGORITHM_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index ef92b420551..a580a1584cc 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -8,2805 +8,2927 @@ cases as published by the Free Software Foundation. */ -#include "glusterfs.h" -#include "xlator.h" -#include "byte-order.h" - #include "afr.h" -#include "afr-transaction.h" -#include "afr-self-heal-common.h" #include "afr-self-heal.h" -#include "pump.h" - -#define ADD_FMT_STRING(msg, off, sh_str, status, print_log) \ - do { \ - if (AFR_SELF_HEAL_NOT_ATTEMPTED != status) { \ - off += snprintf (msg + off, sizeof (msg) - off, \ - " "sh_str" self heal %s,", \ - get_sh_completion_status (status));\ - print_log = 1; \ - } \ - } while (0) - -#define ADD_FMT_STRING_SYNC(msg, off, sh_str, status, print_log) \ - do { \ - if (AFR_SELF_HEAL_SYNC_BEGIN == status || \ - AFR_SELF_HEAL_FAILED == status) { \ - off += snprintf (msg + off, sizeof (msg) - off, \ - " "sh_str" self heal %s,", \ - get_sh_completion_status (status));\ - print_log = 1; \ - } \ - } while (0) - +#include <glusterfs/byte-order.h> +#include "protocol-common.h" +#include "afr-messages.h" +#include <glusterfs/events.h> void -afr_sh_reset (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; +afr_heal_synctask(xlator_t *this, afr_local_t *local); - local = frame->local; - sh = &local->self_heal; - priv = this->private; +int +afr_lookup_and_heal_gfid(xlator_t *this, inode_t *parent, const char *name, + inode_t *inode, struct afr_reply *replies, int source, + unsigned char *sources, void *gfid, int *gfid_idx) +{ + afr_private_t *priv = NULL; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + unsigned char *wind_on = NULL; + ia_type_t ia_type = IA_INVAL; + dict_t *xdata = NULL; + loc_t loc = { + 0, + }; + int ret = 0; + int i = 0; + + priv = this->private; + wind_on = alloca0(priv->child_count); + if (source >= 0 && replies[source].valid && replies[source].op_ret == 0) + ia_type = replies[source].poststat.ia_type; + + if (ia_type != IA_INVAL) + goto heal; + + /* If ia_type is still invalid, it means either + * (a)'source' was -1, i.e. parent dir pending xattrs are in split-brain + * (or) (b) The parent dir pending xattrs are all zeroes (i.e. all bricks + * are sources) and the 'source' we selected earlier might be the one where + * the file is not actually present. + * + * In both cases, let us pick a brick with a successful reply and use its + * ia_type. + * */ + for (i = 0; i < priv->child_count; i++) { + if (source == -1) { + /* case (a) above. */ + if (replies[i].valid && replies[i].op_ret == 0 && + replies[i].poststat.ia_type != IA_INVAL) { + ia_type = replies[i].poststat.ia_type; + break; + } + } else { + /* case (b) above. */ + if (i == source) + continue; + if (sources[i] && replies[i].valid && replies[i].op_ret == 0 && + replies[i].poststat.ia_type != IA_INVAL) { + ia_type = replies[i].poststat.ia_type; + break; + } + } + } - memset (sh->child_errno, 0, - sizeof (*sh->child_errno) * priv->child_count); - memset (sh->buf, 0, sizeof (*sh->buf) * priv->child_count); - memset (sh->parentbufs, 0, - sizeof (*sh->parentbufs) * priv->child_count); - memset (sh->success, 0, sizeof (*sh->success) * priv->child_count); - memset (sh->locked_nodes, 0, - sizeof (*sh->locked_nodes) * priv->child_count); - sh->active_sinks = 0; +heal: + /* gfid heal on those subvolumes that do not have gfid associated + * with the inode and update those replies. + */ + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret != 0) + continue; - afr_reset_xattr (sh->xattr, priv->child_count); -} + if (gf_uuid_is_null(gfid) && + !gf_uuid_is_null(replies[i].poststat.ia_gfid) && + replies[i].poststat.ia_type == ia_type) + gfid = replies[i].poststat.ia_gfid; -//Intersection[child]=1 if child is part of intersection -void -afr_children_intersection_get (int32_t *set1, int32_t *set2, - int *intersection, unsigned int child_count) -{ - int i = 0; + if (!gf_uuid_is_null(replies[i].poststat.ia_gfid) || + replies[i].poststat.ia_type != ia_type) + continue; + + wind_on[i] = 1; + } - memset (intersection, 0, sizeof (*intersection) * child_count); - for (i = 0; i < child_count; i++) { - intersection[i] = afr_is_child_present (set1, child_count, i) - && afr_is_child_present (set2, child_count, - i); + if (AFR_COUNT(wind_on, priv->child_count) == 0) + return 0; + + xdata = dict_new(); + if (!xdata) { + ret = -ENOMEM; + goto out; + } + + ret = dict_set_gfuuid(xdata, "gfid-req", gfid, true); + if (ret) { + ret = -ENOMEM; + goto out; + } + + frame = afr_frame_create(this, &ret); + if (!frame) { + ret = -ret; + goto out; + } + + local = frame->local; + loc.parent = inode_ref(parent); + gf_uuid_copy(loc.pargfid, parent->gfid); + loc.name = name; + loc.inode = inode_ref(inode); + + AFR_ONLIST(wind_on, frame, afr_selfheal_discover_cbk, lookup, &loc, xdata); + + for (i = 0; i < priv->child_count; i++) { + if (!wind_on[i]) + continue; + afr_reply_wipe(&replies[i]); + afr_reply_copy(&replies[i], &local->replies[i]); + } + if (gfid_idx && (*gfid_idx == -1)) { + /*Pick a brick where the gifd heal was successful.*/ + for (i = 0; i < priv->child_count; i++) { + if (!wind_on[i]) + continue; + if (replies[i].valid && replies[i].op_ret == 0 && + !gf_uuid_is_null(replies[i].poststat.ia_gfid)) { + *gfid_idx = i; + break; + } } -} + } +out: + if (gfid_idx && (*gfid_idx == -1) && (ret == 0) && local) { + ret = -afr_final_errno(local, priv); + } + loc_wipe(&loc); + if (frame) + AFR_STACK_DESTROY(frame); + if (xdata) + dict_unref(xdata); -/** - * select_source - select a source and return it - */ + return ret; +} int -afr_sh_select_source (int sources[], int child_count) +afr_gfid_sbrain_source_from_src_brick(xlator_t *this, struct afr_reply *replies, + char *src_brick) { - int i = 0; - for (i = 0; i < child_count; i++) - if (sources[i]) - return i; + int i = 0; + afr_private_t *priv = NULL; - return -1; + priv = this->private; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (strcmp(priv->children[i]->name, src_brick) == 0) + return i; + } + return -1; } -void -afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this) +int +afr_selfheal_gfid_mismatch_by_majority(struct afr_reply *replies, + int child_count) { - int i = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int active_sinks = 0; + int j = 0; + int i = 0; + int votes; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + for (i = 0; i < child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] == 0 && local->child_up[i] == 1) { - active_sinks++; - sh->success[i] = 1; - } else if (sh->sources[i] == 1 && local->child_up[i] == 1) { - sh->success[i] = 1; - } + votes = 1; + for (j = i + 1; j < child_count; j++) { + if ((!gf_uuid_compare(replies[i].poststat.ia_gfid, + replies[j].poststat.ia_gfid))) + votes++; + if (votes > child_count / 2) + return i; } - sh->active_sinks = active_sinks; + } + + return -1; } int -afr_sh_source_count (int sources[], int child_count) +afr_gfid_sbrain_source_from_bigger_file(struct afr_reply *replies, + int child_count) { - int i = 0; - int nsource = 0; + int i = 0; + int src = -1; + uint64_t size = 0; - for (i = 0; i < child_count; i++) - if (sources[i]) - nsource++; - return nsource; + for (i = 0; i < child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (size < replies[i].poststat.ia_size) { + src = i; + size = replies[i].poststat.ia_size; + } else if (replies[i].poststat.ia_size == size) { + src = -1; + } + } + return src; } -void -afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno) -{ - sh->op_ret = -1; - sh->op_errno = afr_most_important_error(sh->op_errno, op_errno, - _gf_false); +int +afr_gfid_sbrain_source_from_latest_mtime(struct afr_reply *replies, + int child_count) +{ + int i = 0; + int src = -1; + uint32_t mtime = 0; + uint32_t mtime_nsec = 0; + + for (i = 0; i < child_count; i++) { + if (!replies[i].valid || replies[i].op_ret != 0) + continue; + if ((mtime < replies[i].poststat.ia_mtime) || + ((mtime == replies[i].poststat.ia_mtime) && + (mtime_nsec < replies[i].poststat.ia_mtime_nsec))) { + src = i; + mtime = replies[i].poststat.ia_mtime; + mtime_nsec = replies[i].poststat.ia_mtime_nsec; + } else if ((mtime == replies[i].poststat.ia_mtime) && + (mtime_nsec == replies[i].poststat.ia_mtime_nsec)) { + src = -1; + } + } + return src; } -void -afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) -{ - afr_private_t * priv = this->private; - char *buf = NULL; - char *ptr = NULL; - int i = 0; - int j = 0; - - /* 10 digits per entry + 1 space + '[' and ']' */ - buf = GF_MALLOC (priv->child_count * 11 + 8, gf_afr_mt_char); - - for (i = 0; i < priv->child_count; i++) { - ptr = buf; - ptr += sprintf (ptr, "[ "); - for (j = 0; j < priv->child_count; j++) { - ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); +int +afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies, + inode_t *inode, uuid_t pargfid, const char *bname, + int src_idx, int child_idx, + unsigned char *locked_on, int *src, dict_t *xdata) +{ + afr_private_t *priv = NULL; + char g1[64] = { + 0, + }; + char g2[64] = { + 0, + }; + int up_count = 0; + int heal_op = -1; + int ret = -1; + char *src_brick = NULL; + + *src = -1; + priv = this->private; + up_count = AFR_COUNT(locked_on, priv->child_count); + if (up_count != priv->child_count) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "All the bricks should be up to resolve the gfid split " + "barin"); + if (xdata) { + ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg", + SALL_BRICKS_UP_TO_RESOLVE); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED, + "Error setting" + " gfid-heal-msg dict"); + } + goto out; + } + + if (xdata) { + ret = dict_get_int32_sizen(xdata, "heal-op", &heal_op); + if (ret) + goto fav_child; + } else { + goto fav_child; + } + + switch (heal_op) { + case GF_SHD_OP_SBRAIN_HEAL_FROM_BIGGER_FILE: + *src = afr_gfid_sbrain_source_from_bigger_file(replies, + priv->child_count); + if (*src == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + SNO_BIGGER_FILE); + if (xdata) { + ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg", + SNO_BIGGER_FILE); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, + AFR_MSG_DICT_SET_FAILED, + "Error" + " setting gfid-heal-msg dict"); } - sprintf (ptr, "]"); - gf_log (this->name, GF_LOG_DEBUG, "pending_matrix: %s", buf); - } - - GF_FREE (buf); -} - -char* -afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this) -{ - afr_private_t * priv = this->private; - char *buf = NULL; - char *ptr = NULL; - int i = 0; - int j = 0; - int child_count = priv->child_count; - char *matrix_begin = "[ [ "; - char *matrix_end = "] ]"; - char *seperator = "] [ "; - int pending_entry_strlen = 12; //Including space after entry - int matrix_begin_strlen = 0; - int matrix_end_strlen = 0; - int seperator_strlen = 0; - int string_length = 0; - char *msg = "- Pending matrix: "; - - /* - * for a list of lists of [ [ a b ] [ c d ] ] - * */ - - matrix_begin_strlen = strlen (matrix_begin); - matrix_end_strlen = strlen (matrix_end); - seperator_strlen = strlen (seperator); - string_length = matrix_begin_strlen + matrix_end_strlen - + (child_count -1) * seperator_strlen - + (child_count * child_count * pending_entry_strlen); - - buf = GF_CALLOC (1, 1 + strlen (msg) + string_length , gf_afr_mt_char); - if (!buf) - goto out; - - ptr = buf; - ptr += sprintf (ptr, "%s", msg); - ptr += sprintf (ptr, "%s", matrix_begin); - for (i = 0; i < priv->child_count; i++) { - for (j = 0; j < priv->child_count; j++) { - ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); + } + break; + + case GF_SHD_OP_SBRAIN_HEAL_FROM_LATEST_MTIME: + *src = afr_gfid_sbrain_source_from_latest_mtime(replies, + priv->child_count); + if (*src == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + SNO_DIFF_IN_MTIME); + if (xdata) { + ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg", + SNO_DIFF_IN_MTIME); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, + AFR_MSG_DICT_SET_FAILED, + "Error" + "setting gfid-heal-msg dict"); } - if (i < priv->child_count -1) - ptr += sprintf (ptr, "%s", seperator); - } + } + break; + + case GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK: + ret = dict_get_str_sizen(xdata, "child-name", &src_brick); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "Error getting the source " + "brick"); + break; + } + *src = afr_gfid_sbrain_source_from_src_brick(this, replies, + src_brick); + if (*src == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + SERROR_GETTING_SRC_BRICK); + if (xdata) { + ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg", + SERROR_GETTING_SRC_BRICK); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, + AFR_MSG_DICT_SET_FAILED, + "Error" + " setting gfid-heal-msg dict"); + } + } + break; - ptr += sprintf (ptr, "%s", matrix_end); + default: + break; + } + goto out; + +fav_child: + switch (priv->fav_child_policy) { + case AFR_FAV_CHILD_BY_SIZE: + *src = afr_sh_fav_by_size(this, replies, inode); + break; + case AFR_FAV_CHILD_BY_MTIME: + *src = afr_sh_fav_by_mtime(this, replies, inode); + break; + case AFR_FAV_CHILD_BY_CTIME: + *src = afr_sh_fav_by_ctime(this, replies, inode); + break; + case AFR_FAV_CHILD_BY_MAJORITY: + if (priv->child_count != 2) + *src = afr_selfheal_gfid_mismatch_by_majority( + replies, priv->child_count); + else + *src = -1; + + if (*src == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "No majority to resolve " + "gfid split brain"); + } + break; + default: + break; + } out: - return buf; + if (*src == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "Gfid mismatch detected for <gfid:%s>/%s>, %s on %s and" + " %s on %s.", + uuid_utoa(pargfid), bname, + uuid_utoa_r(replies[child_idx].poststat.ia_gfid, g1), + priv->children[child_idx]->name, + uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2), + priv->children[src_idx]->name); + gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" + "subvol=%s;type=gfid;file=" + "<gfid:%s>/%s>;count=2;child-%d=%s;gfid-%d=%s;" + "child-%d=%s;gfid-%d=%s", + this->ctx->cmd_args.client_pid, this->name, uuid_utoa(pargfid), + bname, child_idx, priv->children[child_idx]->name, child_idx, + uuid_utoa_r(replies[child_idx].poststat.ia_gfid, g1), src_idx, + priv->children[src_idx]->name, src_idx, + uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2)); + return -1; + } + return 0; } -void -afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, - const char *loc) +int +afr_selfheal_post_op_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { - char *buf = NULL; - char *free_ptr = NULL; + afr_local_t *local = NULL; - buf = afr_get_pending_matrix_str (pending_matrix, this); - if (buf) - free_ptr = buf; - else - buf = ""; + local = frame->local; + local->op_ret = op_ret; + local->op_errno = op_errno; + syncbarrier_wake(&local->barrier); - gf_log (this->name, GF_LOG_ERROR, "Unable to self-heal contents of '%s'" - " (possible split-brain). Please delete the file from all but " - "the preferred subvolume.%s", loc, buf); - GF_FREE (free_ptr); - return; + return 0; } - -void -afr_init_pending_matrix (int32_t **pending_matrix, size_t child_count) +int +afr_selfheal_post_op(call_frame_t *frame, xlator_t *this, inode_t *inode, + int subvol, dict_t *xattr, dict_t *xdata) { - int i = 0; - int j = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + loc_t loc = { + 0, + }; + int ret = 0; - GF_ASSERT (pending_matrix); + priv = this->private; + local = frame->local; - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - pending_matrix[i][j] = 0; - } - } -} + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); -void -afr_mark_ignorant_subvols_as_pending (int32_t **pending_matrix, - unsigned char *ignorant_subvols, - size_t child_count) -{ - int i = 0; - int j = 0; - - GF_ASSERT (pending_matrix); - GF_ASSERT (ignorant_subvols); - - for (i = 0; i < child_count; i++) { - if (ignorant_subvols[i]) { - for (j = 0; j < child_count; j++) { - if (!ignorant_subvols[j]) - pending_matrix[j][i] += 1; - } - } - } -} + local->op_ret = 0; -int -afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, - unsigned char *ignorant_subvols, - dict_t *xattr[], afr_transaction_type type, - size_t child_count) -{ - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - int ret = -1; - int i = 0; - int j = 0; - int k = 0; - - afr_init_pending_matrix (pending_matrix, child_count); - - for (i = 0; i < child_count; i++) { - pending_raw = NULL; - - for (j = 0; j < child_count; j++) { - ret = dict_get_ptr (xattr[i], pending_key[j], - &pending_raw); - - if (ret != 0) { - /* - * There is no xattr present. This means this - * subvolume should be considered an 'ignorant' - * subvolume. - */ - - if (ignorant_subvols) - ignorant_subvols[i] = 1; - continue; - } - - memcpy (pending, pending_raw, sizeof(pending)); - k = afr_index_for_transaction_type (type); - - pending_matrix[i][j] = ntoh32 (pending[k]); - } - } + STACK_WIND(frame, afr_selfheal_post_op_cbk, priv->children[subvol], + priv->children[subvol]->fops->xattrop, &loc, + GF_XATTROP_ADD_ARRAY, xattr, xdata); - return ret; -} + syncbarrier_wait(&local->barrier, 1); + if (local->op_ret < 0) + ret = -local->op_errno; -typedef enum { - AFR_NODE_INVALID, - AFR_NODE_INNOCENT, - AFR_NODE_FOOL, - AFR_NODE_WISE, -} afr_node_type; + loc_wipe(&loc); + local->op_ret = 0; -typedef struct { - afr_node_type type; - int wisdom; -} afr_node_character; + return ret; +} +int +afr_check_stale_error(struct afr_reply *replies, afr_private_t *priv) +{ + int i = 0; + int op_errno = 0; + int tmp_errno = 0; + int stale_count = 0; + + for (i = 0; i < priv->child_count; i++) { + tmp_errno = replies[i].op_errno; + if (tmp_errno == ENOENT || tmp_errno == ESTALE) { + op_errno = afr_higher_errno(op_errno, tmp_errno); + stale_count++; + } + } + if (stale_count != priv->child_count) + return -ENOTCONN; + else + return -op_errno; +} -static int -afr_sh_is_innocent (int32_t *array, int child_count) +int +afr_sh_generic_fop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) { - int i = 0; - int ret = 1; /* innocent until proven guilty */ + int i = (long)cookie; + afr_local_t *local = NULL; - for (i = 0; i < child_count; i++) { - if (array[i]) { - ret = 0; - break; - } - } + local = frame->local; - return ret; -} + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (pre) + local->replies[i].prestat = *pre; + if (post) + local->replies[i].poststat = *post; + if (xdata) + local->replies[i].xdata = dict_ref(xdata); + syncbarrier_wake(&local->barrier); -static int -afr_sh_is_fool (int32_t *array, int i, int child_count) -{ - return array[i]; /* fool if accuses itself */ + return 0; } +int +afr_selfheal_restore_time(call_frame_t *frame, xlator_t *this, inode_t *inode, + int source, unsigned char *healed_sinks, + struct afr_reply *replies) +{ + loc_t loc = { + 0, + }; + + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + + AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, setattr, &loc, + &replies[source].poststat, + (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME | GF_SET_ATTR_CTIME), + NULL); + + loc_wipe(&loc); + + return 0; +} + +dict_t * +afr_selfheal_output_xattr(xlator_t *this, gf_boolean_t is_full_crawl, + afr_transaction_type type, int *output_dirty, + int **output_matrix, int subvol, + int **full_heal_mtx_out) +{ + int j = 0; + int idx = 0; + int d_idx = 0; + int ret = 0; + int *raw = 0; + dict_t *xattr = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + idx = afr_index_for_transaction_type(type); + d_idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION); + + xattr = dict_new(); + if (!xattr) + return NULL; + + /* clear dirty */ + raw = GF_CALLOC(sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t); + if (!raw) + goto err; + + raw[idx] = hton32(output_dirty[subvol]); + ret = dict_set_bin(xattr, AFR_DIRTY, raw, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + GF_FREE(raw); + goto err; + } + + /* clear/set pending */ + for (j = 0; j < priv->child_count; j++) { + raw = GF_CALLOC(sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t); + if (!raw) + goto err; + + raw[idx] = hton32(output_matrix[subvol][j]); + if (is_full_crawl) + raw[d_idx] = hton32(full_heal_mtx_out[subvol][j]); + + ret = dict_set_bin(xattr, priv->pending_key[j], raw, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + GF_FREE(raw); + goto err; + } + } -static int -afr_sh_is_wise (int32_t *array, int i, int child_count) -{ - return !array[i]; /* wise if does not accuse itself */ + return xattr; +err: + if (xattr) + dict_unref(xattr); + return NULL; } +int +afr_selfheal_undo_pending(call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *undid_pending, + afr_transaction_type type, struct afr_reply *replies, + unsigned char *locked_on) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int j = 0; + unsigned char *pending = NULL; + int *input_dirty = NULL; + int **input_matrix = NULL; + int **full_heal_mtx_in = NULL; + int **full_heal_mtx_out = NULL; + int *output_dirty = NULL; + int **output_matrix = NULL; + dict_t *xattr = NULL; + dict_t *xdata = NULL; + + priv = this->private; + local = frame->local; + + pending = alloca0(priv->child_count); + + input_dirty = alloca0(priv->child_count * sizeof(int)); + input_matrix = ALLOC_MATRIX(priv->child_count, int); + full_heal_mtx_in = ALLOC_MATRIX(priv->child_count, int); + full_heal_mtx_out = ALLOC_MATRIX(priv->child_count, int); + output_dirty = alloca0(priv->child_count * sizeof(int)); + output_matrix = ALLOC_MATRIX(priv->child_count, int); + + xdata = dict_new(); + if (!xdata) + return -1; -static int -afr_sh_all_nodes_innocent (afr_node_character *characters, - int child_count) -{ - int i = 0; - int ret = 1; + afr_selfheal_extract_xattr(this, replies, type, input_dirty, input_matrix); - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_INNOCENT) { - ret = 0; - break; - } - } + if (local->need_full_crawl) + afr_selfheal_extract_xattr(this, replies, AFR_DATA_TRANSACTION, NULL, + full_heal_mtx_in); - return ret; -} + for (i = 0; i < priv->child_count; i++) + if (sinks[i] && !healed_sinks[i]) + pending[i] = 1; - -static int -afr_sh_wise_nodes_exist (afr_node_character *characters, int child_count) -{ - int i = 0; - int ret = 0; - - for (i = 0; i < child_count; i++) { - if (characters[i].type == AFR_NODE_WISE) { - ret = 1; - break; - } + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + if (pending[j]) { + output_matrix[i][j] = 1; + if (type == AFR_ENTRY_TRANSACTION) + full_heal_mtx_out[i][j] = 1; + } else if (locked_on[j]) { + output_matrix[i][j] = -input_matrix[i][j]; + if (type == AFR_ENTRY_TRANSACTION) + full_heal_mtx_out[i][j] = -full_heal_mtx_in[i][j]; + } } + } - return ret; -} + for (i = 0; i < priv->child_count; i++) { + if (!pending[i]) + output_dirty[i] = -input_dirty[i]; + } + for (i = 0; i < priv->child_count; i++) { + if (!locked_on[i]) + /* perform post-op only on subvols we had locked + and inspected on. + */ + continue; + if (undid_pending[i]) + /* We already unset the pending xattrs in + * _afr_fav_child_reset_sink_xattrs(). */ + continue; -/* - * The 'wisdom' of a wise node is 0 if any other wise node accuses it. - * It is 1 if no other wise node accuses it. - * Only wise nodes with wisdom 1 are sources. - * - * If no nodes with wisdom 1 exist, a split-brain has occurred. - */ + xattr = afr_selfheal_output_xattr(this, local->need_full_crawl, type, + output_dirty, output_matrix, i, + full_heal_mtx_out); + if (!xattr) { + continue; + } -static void -afr_sh_compute_wisdom (int32_t *pending_matrix[], - afr_node_character characters[], int child_count) -{ - int i = 0; - int j = 0; + if ((type == AFR_ENTRY_TRANSACTION) && (priv->esh_granular)) { + if (xdata && dict_set_int8(xdata, GF_XATTROP_PURGE_INDEX, 1)) + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_DICT_SET_FAILED, + "Failed to set" + " dict value for %s", + GF_XATTROP_PURGE_INDEX); + } - for (i = 0; i < child_count; i++) { - if (characters[i].type == AFR_NODE_WISE) { - characters[i].wisdom = 1; + afr_selfheal_post_op(frame, this, inode, i, xattr, xdata); + dict_unref(xattr); + } - for (j = 0; j < child_count; j++) { - if ((characters[j].type == AFR_NODE_WISE) - && pending_matrix[j][i]) { + if (xdata) + dict_unref(xdata); - characters[i].wisdom = 0; - } - } - } - } + return 0; } +void +afr_reply_copy(struct afr_reply *dst, struct afr_reply *src) +{ + dict_t *xdata = NULL; + + dst->valid = src->valid; + dst->op_ret = src->op_ret; + dst->op_errno = src->op_errno; + dst->prestat = src->prestat; + dst->poststat = src->poststat; + dst->preparent = src->preparent; + dst->postparent = src->postparent; + dst->preparent2 = src->preparent2; + dst->postparent2 = src->postparent2; + if (src->xdata) + xdata = dict_ref(src->xdata); + else + xdata = NULL; + if (dst->xdata) + dict_unref(dst->xdata); + dst->xdata = xdata; + if (xdata && dict_get_str_boolean(xdata, "fips-mode-rchecksum", + _gf_false) == _gf_true) { + memcpy(dst->checksum, src->checksum, SHA256_DIGEST_LENGTH); + } else { + memcpy(dst->checksum, src->checksum, MD5_DIGEST_LENGTH); + } + dst->fips_mode_rchecksum = src->fips_mode_rchecksum; +} -static int -afr_sh_wise_nodes_conflict (afr_node_character *characters, - int child_count) +void +afr_replies_copy(struct afr_reply *dst, struct afr_reply *src, int count) { - int i = 0; - int ret = 1; + int i = 0; - for (i = 0; i < child_count; i++) { - if ((characters[i].type == AFR_NODE_WISE) - && characters[i].wisdom == 1) { - - /* There is atleast one bona-fide wise node */ - ret = 0; - break; - } - } + if (dst == src) + return; - return ret; + for (i = 0; i < count; i++) { + afr_reply_copy(&dst[i], &src[i]); + } } - -static int -afr_sh_mark_wisest_as_sources (int sources[], - afr_node_character *characters, - int child_count) +int +afr_selfheal_fill_dirty(xlator_t *this, int *dirty, int subvol, int idx, + dict_t *xdata) { - int nsources = 0; - int i = 0; + void *pending_raw = NULL; + int pending[3] = { + 0, + }; - for (i = 0; i < child_count; i++) { - if (characters[i].wisdom == 1) { - sources[i] = 1; - nsources++; - } - } + if (!dirty) + return 0; - return nsources; -} + if (dict_get_ptr(xdata, AFR_DIRTY, &pending_raw)) + return -1; -static void -afr_compute_witness_of_fools (int32_t *witnesses, int32_t **pending_matrix, - afr_node_character *characters, - int32_t child_count) -{ - int i = 0; - int j = 0; - int witness = 0; - - GF_ASSERT (witnesses); - GF_ASSERT (pending_matrix); - GF_ASSERT (characters); - GF_ASSERT (child_count > 0); - - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_FOOL) - continue; - - witness = 0; - for (j = 0; j < child_count; j++) { - if (i == j) - continue; - witness += pending_matrix[i][j]; - } - witnesses[i] = witness; - } + if (!pending_raw) + return -1; + + memcpy(pending, pending_raw, sizeof(pending)); + + dirty[subvol] = ntoh32(pending[idx]); + + return 0; } -static int32_t -afr_find_biggest_witness_among_fools (int32_t *witnesses, - afr_node_character *characters, - int32_t child_count) +int +afr_selfheal_fill_matrix(xlator_t *this, int **matrix, int subvol, int idx, + dict_t *xdata) { - int i = 0; - int biggest_witness = -1; - int biggest_witness_idx = -1; - int biggest_witness_cnt = -1; + int i = 0; + void *pending_raw = NULL; + int pending[3] = { + 0, + }; + afr_private_t *priv = NULL; - GF_ASSERT (witnesses); - GF_ASSERT (characters); - GF_ASSERT (child_count > 0); + priv = this->private; - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_FOOL) - continue; + if (!matrix) + return 0; - if (biggest_witness < witnesses[i]) { - biggest_witness = witnesses[i]; - biggest_witness_idx = i; - biggest_witness_cnt = 1; - continue; - } + for (i = 0; i < priv->child_count; i++) { + if (dict_get_ptr(xdata, priv->pending_key[i], &pending_raw)) + continue; - if (biggest_witness == witnesses[i]) - biggest_witness_cnt++; - } + if (!pending_raw) + continue; - if (biggest_witness_cnt != 1) - return -1; + memcpy(pending, pending_raw, sizeof(pending)); - return biggest_witness_idx; + matrix[subvol][i] = ntoh32(pending[idx]); + } + + return 0; } int -afr_mark_fool_as_source_by_witness (int32_t *sources, int32_t *witnesses, - afr_node_character *characters, - int32_t child_count, int32_t witness) +afr_selfheal_extract_xattr(xlator_t *this, struct afr_reply *replies, + afr_transaction_type type, int *dirty, int **matrix) { - int i = 0; - int nsources = 0; + afr_private_t *priv = NULL; + int i = 0; + dict_t *xdata = NULL; + int idx = -1; - GF_ASSERT (sources); - GF_ASSERT (witnesses); - GF_ASSERT (characters); - GF_ASSERT (child_count > 0); + idx = afr_index_for_transaction_type(type); - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_FOOL) - continue; + priv = this->private; - if (witness == witnesses[i]) { - sources[i] = 1; - nsources++; - } - } - return nsources; -} + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret != 0) + continue; + if (!replies[i].xdata) + continue; -int -afr_mark_fool_as_source_by_idx (int32_t *sources, int child_count, int idx) -{ - if (idx >= 0 && idx < child_count) { - sources[idx] = 1; - return 1; - } - return 0; -} + xdata = replies[i].xdata; + afr_selfheal_fill_dirty(this, dirty, i, idx, xdata); + afr_selfheal_fill_matrix(this, matrix, i, idx, xdata); + } -static int -afr_find_largest_file_size (struct iatt *bufs, int32_t *success_children, - int child_count) -{ - int idx = -1; - int i = -1; - int child = -1; - uint64_t max_size = 0; - uint64_t min_size = 0; - int num_children = 0; - - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - - child = success_children[i]; - if (bufs[child].ia_size > max_size) { - max_size = bufs[child].ia_size; - idx = child; - } - - if ((num_children == 0) || (bufs[child].ia_size < min_size)) { - min_size = bufs[child].ia_size; - } + return 0; +} - num_children++; - } +/* + * If by chance there are multiple sources with differing sizes, select + * the largest file as the source. + * + * This can happen if data was directly modified in the backend or for snapshots + */ +void +afr_mark_largest_file_as_source(xlator_t *this, unsigned char *sources, + struct afr_reply *replies) +{ + int i = 0; + afr_private_t *priv = NULL; + uint64_t size = 0; - /* If sizes are same for all of them, finding sources will have to - * happen with pending changelog. So return -1 - */ - if ((num_children > 1) && (min_size == max_size)) - return -1; - return idx; + /* Find source with biggest file size */ + priv = this->private; + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (!replies[i].valid || replies[i].op_ret != 0) { + sources[i] = 0; + continue; + } + if (size <= replies[i].poststat.ia_size) { + size = replies[i].poststat.ia_size; + } + } + + /* Mark sources with less size as not source */ + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (size > replies[i].poststat.ia_size) + sources[i] = 0; + } } +void +afr_mark_latest_mtime_file_as_source(xlator_t *this, unsigned char *sources, + struct afr_reply *replies) +{ + int i = 0; + afr_private_t *priv = NULL; + uint32_t mtime = 0; + uint32_t mtime_nsec = 0; + + priv = this->private; + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (!replies[i].valid || replies[i].op_ret != 0) { + sources[i] = 0; + continue; + } + if ((mtime < replies[i].poststat.ia_mtime) || + ((mtime == replies[i].poststat.ia_mtime) && + (mtime_nsec < replies[i].poststat.ia_mtime_nsec))) { + mtime = replies[i].poststat.ia_mtime; + mtime_nsec = replies[i].poststat.ia_mtime_nsec; + } + } + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if ((mtime > replies[i].poststat.ia_mtime) || + ((mtime == replies[i].poststat.ia_mtime) && + (mtime_nsec > replies[i].poststat.ia_mtime_nsec))) { + sources[i] = 0; + } + } +} -static int -afr_find_newest_file (struct iatt *bufs, int32_t *success_children, - int child_count) +void +afr_mark_active_sinks(xlator_t *this, unsigned char *sources, + unsigned char *locked_on, unsigned char *sinks) { - int idx = -1; - int i = -1; - int child = -1; - uint64_t max_ctime = 0; - - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; + int i = 0; + afr_private_t *priv = NULL; - child = success_children[i]; - if (bufs[child].ia_ctime > max_ctime) { - max_ctime = bufs[child].ia_ctime; - idx = child; - } - } + priv = this->private; - return idx; + for (i = 0; i < priv->child_count; i++) { + if (!sources[i] && locked_on[i]) + sinks[i] = 1; + else + sinks[i] = 0; + } } +gf_boolean_t +afr_dict_contains_heal_op(call_frame_t *frame) +{ + afr_local_t *local = NULL; + dict_t *xdata_req = NULL; + int ret = 0; + int heal_op = -1; + + local = frame->local; + xdata_req = local->xdata_req; + ret = dict_get_int32_sizen(xdata_req, "heal-op", &heal_op); + if (ret) + return _gf_false; + if (local->xdata_rsp == NULL) { + local->xdata_rsp = dict_new(); + if (!local->xdata_rsp) + return _gf_true; + } + ret = dict_set_sizen_str_sizen(local->xdata_rsp, "sh-fail-msg", + SFILE_NOT_IN_SPLIT_BRAIN); -static int -afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, - afr_node_character *characters, - int32_t *success_children, - int child_count, struct iatt *bufs) -{ - int32_t biggest_witness = 0; - int nsources = 0; - int32_t *witnesses = NULL; - - GF_ASSERT (child_count > 0); - - biggest_witness = afr_find_largest_file_size (bufs, success_children, - child_count); - if (biggest_witness != -1) - goto found; - - witnesses = GF_CALLOC (child_count, sizeof (*witnesses), - gf_afr_mt_int32_t); - if (NULL == witnesses) { - nsources = -1; - goto out; - } + return _gf_true; +} - afr_compute_witness_of_fools (witnesses, pending_matrix, characters, - child_count); - biggest_witness = afr_find_biggest_witness_among_fools (witnesses, - characters, - child_count); - if (biggest_witness != -1) - goto found; +gf_boolean_t +afr_can_decide_split_brain_source_sinks(struct afr_reply *replies, + int child_count) +{ + int i = 0; - biggest_witness = afr_find_newest_file (bufs, success_children, - child_count); + for (i = 0; i < child_count; i++) + if (replies[i].valid != 1 || replies[i].op_ret != 0) + return _gf_false; + + return _gf_true; +} -found: - nsources = afr_mark_fool_as_source_by_idx (sources, child_count, - biggest_witness); +int +afr_mark_split_brain_source_sinks_by_heal_op( + call_frame_t *frame, xlator_t *this, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on, + struct afr_reply *replies, afr_transaction_type type, int heal_op) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + dict_t *xdata_req = NULL; + dict_t *xdata_rsp = NULL; + int ret = 0; + int i = 0; + char *name = NULL; + int source = -1; + + local = frame->local; + priv = this->private; + xdata_req = local->xdata_req; + + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i]) + if (sources[i] || !sinks[i] || !healed_sinks[i]) { + ret = -1; + goto out; + } + } + if (local->xdata_rsp == NULL) { + local->xdata_rsp = dict_new(); + if (!local->xdata_rsp) { + ret = -1; + goto out; + } + } + xdata_rsp = local->xdata_rsp; + + if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SBRAIN_HEAL_NO_GO_MSG); + ret = -1; + goto out; + } + + for (i = 0; i < priv->child_count; i++) + if (locked_on[i]) + sources[i] = 1; + switch (heal_op) { + case GF_SHD_OP_SBRAIN_HEAL_FROM_BIGGER_FILE: + if (type == AFR_METADATA_TRANSACTION) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SUSE_SOURCE_BRICK_TO_HEAL); + if (!ret) + ret = -1; + goto out; + } + afr_mark_largest_file_as_source(this, sources, replies); + if (AFR_COUNT(sources, priv->child_count) != 1) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SNO_BIGGER_FILE); + if (!ret) + ret = -1; + goto out; + } + break; + case GF_SHD_OP_SBRAIN_HEAL_FROM_LATEST_MTIME: + if (type == AFR_METADATA_TRANSACTION) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SUSE_SOURCE_BRICK_TO_HEAL); + if (!ret) + ret = -1; + goto out; + } + afr_mark_latest_mtime_file_as_source(this, sources, replies); + if (AFR_COUNT(sources, priv->child_count) != 1) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SNO_DIFF_IN_MTIME); + if (!ret) + ret = -1; + goto out; + } + break; + case GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK: + ret = dict_get_str_sizen(xdata_req, "child-name", &name); + if (ret) + goto out; + source = afr_get_child_index_from_name(this, name); + if (source < 0) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SINVALID_BRICK_NAME); + if (!ret) + ret = -1; + goto out; + } + if (locked_on[source] != 1) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SBRICK_IS_NOT_UP); + if (!ret) + ret = -1; + goto out; + } + memset(sources, 0, sizeof(*sources) * priv->child_count); + sources[source] = 1; + break; + default: + ret = -1; + goto out; + } + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + source = i; + break; + } + } + sinks[source] = 0; + healed_sinks[source] = 0; + ret = source; out: - GF_FREE (witnesses); - return nsources; + if (ret < 0) + memset(sources, 0, sizeof(*sources) * priv->child_count); + return ret; } int -afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs, - int32_t *success_children, - unsigned int child_count, uint32_t uid) -{ - int i = 0; - int nsources = 0; - int child = 0; - - for (i = 0; i < child_count; i++) { - if (-1 == success_children[i]) - break; - - child = success_children[i]; - if (uid == bufs[child].ia_uid) { - sources[child] = 1; - nsources++; +afr_sh_fav_by_majority(xlator_t *this, struct afr_reply *replies, + inode_t *inode) +{ + afr_private_t *priv; + int vote_count = -1; + int fav_child = -1; + int i = 0; + int k = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid == 1) { + gf_msg_debug(this->name, 0, + "Child:%s mtime_sec = %" PRId64 ", size = %" PRIu64 + " for gfid %s", + priv->children[i]->name, replies[i].poststat.ia_mtime, + replies[i].poststat.ia_size, uuid_utoa(inode->gfid)); + vote_count = 0; + for (k = 0; k < priv->child_count; k++) { + if ((replies[k].poststat.ia_mtime == + replies[i].poststat.ia_mtime) && + (replies[k].poststat.ia_size == + replies[i].poststat.ia_size)) { + vote_count++; } + } + if (vote_count > priv->child_count / 2) { + fav_child = i; + break; + } } - return nsources; + } + return fav_child; } +/* + * afr_sh_fav_by_mtime: Choose favorite child by mtime. + */ int -afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *success_children, - unsigned int child_count) -{ - int i = 0; - int smallest = -1; - int child = 0; - - for (i = 0; i < child_count; i++) { - if (-1 == success_children[i]) - break; - child = success_children[i]; - if ((smallest == -1) || - (bufs[child].ia_uid < bufs[smallest].ia_uid)) { - smallest = child; - } - } - return smallest; +afr_sh_fav_by_mtime(xlator_t *this, struct afr_reply *replies, inode_t *inode) +{ + afr_private_t *priv; + int fav_child = -1; + int i = 0; + uint32_t cmp_mtime = 0; + uint32_t cmp_mtime_nsec = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid == 1) { + gf_msg_debug(this->name, 0, + "Child:%s mtime = %" PRId64 + ", mtime_nsec = %d for " + "gfid %s", + priv->children[i]->name, replies[i].poststat.ia_mtime, + replies[i].poststat.ia_mtime_nsec, + uuid_utoa(inode->gfid)); + if (replies[i].poststat.ia_mtime > cmp_mtime) { + cmp_mtime = replies[i].poststat.ia_mtime; + cmp_mtime_nsec = replies[i].poststat.ia_mtime_nsec; + fav_child = i; + } else if ((replies[i].poststat.ia_mtime == cmp_mtime) && + (replies[i].poststat.ia_mtime_nsec > cmp_mtime_nsec)) { + cmp_mtime = replies[i].poststat.ia_mtime; + cmp_mtime_nsec = replies[i].poststat.ia_mtime_nsec; + fav_child = i; + } + } + } + return fav_child; } -static int -afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *success_children, - int child_count, int32_t *sources) -{ - int nsources = 0; - int smallest = 0; +/* + * afr_sh_fav_by_ctime: Choose favorite child by ctime. + */ +int +afr_sh_fav_by_ctime(xlator_t *this, struct afr_reply *replies, inode_t *inode) +{ + afr_private_t *priv; + int fav_child = -1; + int i = 0; + uint32_t cmp_ctime = 0; + uint32_t cmp_ctime_nsec = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid == 1) { + gf_msg_debug(this->name, 0, + "Child:%s ctime = %" PRId64 + ", ctime_nsec = %d for " + "gfid %s", + priv->children[i]->name, replies[i].poststat.ia_ctime, + replies[i].poststat.ia_ctime_nsec, + uuid_utoa(inode->gfid)); + if (replies[i].poststat.ia_ctime > cmp_ctime) { + cmp_ctime = replies[i].poststat.ia_ctime; + cmp_ctime_nsec = replies[i].poststat.ia_ctime_nsec; + fav_child = i; + } else if ((replies[i].poststat.ia_ctime == cmp_ctime) && + (replies[i].poststat.ia_ctime_nsec > cmp_ctime_nsec)) { + cmp_ctime = replies[i].poststat.ia_ctime; + cmp_ctime_nsec = replies[i].poststat.ia_ctime_nsec; + fav_child = i; + } + } + } + return fav_child; +} - smallest = afr_get_child_with_lowest_uid (bufs, success_children, - child_count); - if (smallest < 0) { - nsources = -1; - goto out; - } - nsources = afr_mark_child_as_source_by_uid (sources, bufs, - success_children, child_count, - bufs[smallest].ia_uid); -out: - return nsources; +/* + * afr_sh_fav_by_size: Choose favorite child by size + * when not all files are of zero size. + */ +int +afr_sh_fav_by_size(xlator_t *this, struct afr_reply *replies, inode_t *inode) +{ + afr_private_t *priv; + int fav_child = -1; + int i = 0; + uint64_t cmp_sz = 0; + + priv = this->private; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) { + continue; + } + gf_msg_debug(this->name, 0, + "Child:%s file size = %" PRIu64 " for gfid %s", + priv->children[i]->name, replies[i].poststat.ia_size, + uuid_utoa(inode->gfid)); + if (replies[i].poststat.ia_type == IA_IFDIR) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY, + "Cannot perform selfheal on %s. " + "Size policy is not applicable to directories.", + uuid_utoa(inode->gfid)); + break; + } + if (replies[i].poststat.ia_size > cmp_sz) { + cmp_sz = replies[i].poststat.ia_size; + fav_child = i; + } else if (replies[i].poststat.ia_size == cmp_sz) { + fav_child = -1; + } + } + if (fav_child == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "No bigger file"); + } + return fav_child; } int -afr_get_no_xattr_dir_read_child (xlator_t *this, int32_t *success_children, - struct iatt *bufs) +afr_sh_get_fav_by_policy(xlator_t *this, struct afr_reply *replies, + inode_t *inode, char **policy_str) { - afr_private_t *priv = NULL; - int i = 0; - int child = -1; - int read_child = -1; + afr_private_t *priv = NULL; + int fav_child = -1; - priv = this->private; - for (i = 0; i < priv->child_count; i++) { - child = success_children[i]; - if (child < 0) - break; - if (read_child < 0) - read_child = child; - else if (bufs[read_child].ia_size < bufs[child].ia_size) - read_child = child; - } - return read_child; + priv = this->private; + if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) { + return -1; + } + + switch (priv->fav_child_policy) { + case AFR_FAV_CHILD_BY_SIZE: + fav_child = afr_sh_fav_by_size(this, replies, inode); + if (policy_str && fav_child >= 0) { + *policy_str = "SIZE"; + } + break; + case AFR_FAV_CHILD_BY_CTIME: + fav_child = afr_sh_fav_by_ctime(this, replies, inode); + if (policy_str && fav_child >= 0) { + *policy_str = "CTIME"; + } + break; + case AFR_FAV_CHILD_BY_MTIME: + fav_child = afr_sh_fav_by_mtime(this, replies, inode); + if (policy_str && fav_child >= 0) { + *policy_str = "MTIME"; + } + break; + case AFR_FAV_CHILD_BY_MAJORITY: + fav_child = afr_sh_fav_by_majority(this, replies, inode); + if (policy_str && fav_child >= 0) { + *policy_str = "MAJORITY"; + } + break; + case AFR_FAV_CHILD_NONE: + default: + break; + } + + return fav_child; } int -afr_sh_mark_zero_size_file_as_sink (struct iatt *bufs, int32_t *success_children, - int child_count, int32_t *sources) -{ - int nsources = 0; - int i = 0; - int child = 0; - gf_boolean_t sink_exists = _gf_false; - gf_boolean_t source_exists = _gf_false; - int source = -1; - - for (i = 0; i < child_count; i++) { - child = success_children[i]; - if (child < 0) - break; - if (!bufs[child].ia_size) { - sink_exists = _gf_true; - continue; - } - if (!source_exists) { - source_exists = _gf_true; - source = child; - continue; - } - if (bufs[source].ia_size != bufs[child].ia_size) { - nsources = -1; - goto out; - } - } - if (!source_exists && !sink_exists) { - nsources = -1; - goto out; - } - - if (!source_exists || !sink_exists) - goto out; - - for (i = 0; i < child_count; i++) { - child = success_children[i]; - if (child < 0) - break; - if (bufs[child].ia_size) { - sources[child] = 1; - nsources++; - } - } -out: - return nsources; +afr_mark_split_brain_source_sinks_by_policy( + call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on, + struct afr_reply *replies, afr_transaction_type type) +{ + afr_private_t *priv = NULL; + int fav_child = -1; + char mtime_str[256]; + char ctime_str[256]; + char *policy_str = NULL; + struct tm *tm_ptr; + time_t time; + + priv = this->private; + + fav_child = afr_sh_get_fav_by_policy(this, replies, inode, &policy_str); + if (fav_child == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY, + "No child selected by favorite-child policy."); + } else if (fav_child > priv->child_count - 1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY, + "Invalid child (%d) " + "selected by policy %s.", + fav_child, policy_str); + } else if (fav_child >= 0) { + time = replies[fav_child].poststat.ia_mtime; + tm_ptr = localtime(&time); + strftime(mtime_str, sizeof(mtime_str), "%Y-%m-%d %H:%M:%S", tm_ptr); + time = replies[fav_child].poststat.ia_ctime; + tm_ptr = localtime(&time); + strftime(ctime_str, sizeof(ctime_str), "%Y-%m-%d %H:%M:%S", tm_ptr); + + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY, + "Source %s selected as authentic to resolve conflicting data " + "in file (gfid:%s) by %s (%" PRIu64 + " bytes @ %s mtime, %s " + "ctime).", + priv->children[fav_child]->name, uuid_utoa(inode->gfid), + policy_str, replies[fav_child].poststat.ia_size, mtime_str, + ctime_str); + + sources[fav_child] = 1; + sinks[fav_child] = 0; + healed_sinks[fav_child] = 0; + } + return fav_child; } -char * -afr_get_character_str (afr_node_type type) +gf_boolean_t +afr_is_file_empty_on_all_children(afr_private_t *priv, + struct afr_reply *replies) { - char *character = NULL; + int i = 0; - switch (type) { - case AFR_NODE_INNOCENT: - character = "innocent"; - break; - case AFR_NODE_FOOL: - character = "fool"; - break; - case AFR_NODE_WISE: - character = "wise"; - break; - default: - character = "invalid"; - break; - } - return character; + for (i = 0; i < priv->child_count; i++) { + if ((!replies[i].valid) || (replies[i].op_ret != 0) || + (replies[i].poststat.ia_size != 0)) + return _gf_false; + } + + return _gf_true; } -afr_node_type -afr_find_child_character_type (int32_t *pending_row, int32_t child, - unsigned int child_count) -{ - afr_node_type type = AFR_NODE_INVALID; +int +afr_mark_source_sinks_if_file_empty(xlator_t *this, unsigned char *sources, + unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *locked_on, + struct afr_reply *replies, + afr_transaction_type type) +{ + int source = -1; + int i = 0; + afr_private_t *priv = this->private; + struct iatt stbuf = { + 0, + }; + + if ((AFR_COUNT(locked_on, priv->child_count) < priv->child_count) || + (afr_success_count(replies, priv->child_count) < priv->child_count)) + return -1; - GF_ASSERT ((child >= 0) && (child < child_count)); + if (type == AFR_DATA_TRANSACTION) { + if (!afr_is_file_empty_on_all_children(priv, replies)) + return -1; + goto mark; + } + + /*For AFR_METADATA_TRANSACTION, metadata must be same on all bricks.*/ + stbuf = replies[0].poststat; + for (i = 1; i < priv->child_count; i++) { + if ((!IA_EQUAL(stbuf, replies[i].poststat, type)) || + (!IA_EQUAL(stbuf, replies[i].poststat, uid)) || + (!IA_EQUAL(stbuf, replies[i].poststat, gid)) || + (!IA_EQUAL(stbuf, replies[i].poststat, prot))) + return -1; + } + for (i = 1; i < priv->child_count; i++) { + if (!afr_xattrs_are_equal(replies[0].xdata, replies[i].xdata)) + return -1; + } + +mark: + /* data/metadata is same on all bricks. Pick one of them as source. Rest + * are sinks.*/ + for (i = 0; i < priv->child_count; i++) { + if (source == -1) { + source = i; + sources[i] = 1; + sinks[i] = 0; + healed_sinks[i] = 0; + continue; + } + sources[i] = 0; + sinks[i] = 1; + healed_sinks[i] = 1; + } + + return source; +} + +/* Return a source depending on the type of heal_op, and set sources[source], + * sinks[source] and healed_sinks[source] to 1, 0 and 0 respectively. Do so + * only if the following condition is met: + * ∀i((i ∈ locked_on[] ∧ i=1)==>(sources[i]=0 ∧ sinks[i]=1 ∧ healed_sinks[i]=1)) + * i.e. for each locked node, sources[node] is 0; healed_sinks[node] and + * sinks[node] are 1. This should be the case if the file is in split-brain. + */ +int +afr_mark_split_brain_source_sinks( + call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on, + struct afr_reply *replies, afr_transaction_type type) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + dict_t *xdata_req = NULL; + int heal_op = -1; + int ret = -1; + int source = -1; + + local = frame->local; + priv = this->private; + xdata_req = local->xdata_req; + + source = afr_mark_source_sinks_if_file_empty( + this, sources, sinks, healed_sinks, locked_on, replies, type); + if (source >= 0) + return source; + + ret = dict_get_int32_sizen(xdata_req, "heal-op", &heal_op); + if (ret) + goto autoheal; + + source = afr_mark_split_brain_source_sinks_by_heal_op( + frame, this, sources, sinks, healed_sinks, locked_on, replies, type, + heal_op); + return source; + +autoheal: + /* Automatically heal if fav_child_policy is set. */ + if (priv->fav_child_policy != AFR_FAV_CHILD_NONE) { + source = afr_mark_split_brain_source_sinks_by_policy( + frame, this, inode, sources, sinks, healed_sinks, locked_on, + replies, type); + if (source != -1) { + ret = dict_set_int32_sizen(xdata_req, "fav-child-policy", 1); + if (ret) + return -1; + } + } - if (afr_sh_is_innocent (pending_row, child_count)) - type = AFR_NODE_INNOCENT; - else if (afr_sh_is_fool (pending_row, child, child_count)) - type = AFR_NODE_FOOL; - else if (afr_sh_is_wise (pending_row, child, child_count)) - type = AFR_NODE_WISE; - return type; + return source; } int -afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs, - int32_t **pending_matrix, int32_t *sources, - int32_t *success_children, afr_transaction_type type, - int32_t *subvol_status, gf_boolean_t ignore_ignorant) -{ - afr_private_t *priv = NULL; - afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID; - int nsources = -1; - unsigned char *ignorant_subvols = NULL; - unsigned int child_count = 0; +_afr_fav_child_reset_sink_xattrs(call_frame_t *frame, xlator_t *this, + inode_t *inode, int source, + unsigned char *healed_sinks, + unsigned char *undid_pending, + afr_transaction_type type, + unsigned char *locked_on, + struct afr_reply *replies) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int *input_dirty = NULL; + int **input_matrix = NULL; + int *output_dirty = NULL; + int **output_matrix = NULL; + dict_t *xattr = NULL; + dict_t *xdata = NULL; + int i = 0; + + priv = this->private; + local = frame->local; + + if (!dict_get_sizen(local->xdata_req, "fav-child-policy")) + return 0; - priv = this->private; - child_count = priv->child_count; + xdata = dict_new(); + if (!xdata) + return -1; - if (afr_get_children_count (success_children, priv->child_count) == 0) - goto out; + input_dirty = alloca0(priv->child_count * sizeof(int)); + input_matrix = ALLOC_MATRIX(priv->child_count, int); + output_dirty = alloca0(priv->child_count * sizeof(int)); + output_matrix = ALLOC_MATRIX(priv->child_count, int); - if (!ignore_ignorant) { - ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), - child_count, gf_afr_mt_char); - if (NULL == ignorant_subvols) - goto out; - } + afr_selfheal_extract_xattr(this, replies, type, input_dirty, input_matrix); - afr_build_pending_matrix (priv->pending_key, pending_matrix, - ignorant_subvols, xattr, type, - priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (i == source || !healed_sinks[i]) + continue; + output_dirty[i] = -input_dirty[i]; + output_matrix[i][source] = -input_matrix[i][source]; + } - if (!ignore_ignorant) - afr_mark_ignorant_subvols_as_pending (pending_matrix, - ignorant_subvols, - priv->child_count); - sh_type = afr_self_heal_type_for_transaction (type); - if (AFR_SELF_HEAL_INVALID == sh_type) - goto out; + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i] || !locked_on[i]) + continue; + xattr = afr_selfheal_output_xattr(this, _gf_false, type, output_dirty, + output_matrix, i, NULL); - afr_sh_print_pending_matrix (pending_matrix, this); + afr_selfheal_post_op(frame, this, inode, i, xattr, xdata); - nsources = afr_mark_sources (this, sources, pending_matrix, bufs, - sh_type, success_children, subvol_status); -out: - GF_FREE (ignorant_subvols); - return nsources; + undid_pending[i] = 1; + dict_unref(xattr); + } + + if (xdata) + dict_unref(xdata); + + return 0; } -void -afr_find_character_types (afr_node_character *characters, - int32_t **pending_matrix, int32_t *success_children, - unsigned int child_count) -{ - afr_node_type type = AFR_NODE_INVALID; - int child = 0; - int i = 0; - - for (i = 0; i < child_count; i++) { - child = success_children[i]; - if (child == -1) - break; - type = afr_find_child_character_type (pending_matrix[child], - child, child_count); - characters[child].type = type; - } +gf_boolean_t +afr_does_witness_exist(xlator_t *this, uint64_t *witness) +{ + int i = 0; + afr_private_t *priv = NULL; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (witness[i]) + return _gf_true; + } + return _gf_false; +} + +unsigned int +afr_get_quorum_count(afr_private_t *priv) +{ + if (priv->quorum_count == AFR_QUORUM_AUTO) { + return priv->child_count / 2 + 1; + } else { + return priv->quorum_count; + } } void -afr_mark_success_children_sources (int32_t *sources, int32_t *success_children, - unsigned int child_count) -{ - int i = 0; - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - sources[success_children[i]] = 1; +afr_selfheal_post_op_failure_accounting(afr_private_t *priv, char *accused, + unsigned char *sources, + unsigned char *locked_on) +{ + int i = 0; + unsigned int quorum_count = 0; + + if (AFR_COUNT(sources, priv->child_count) != 0) + return; + + quorum_count = afr_get_quorum_count(priv); + for (i = 0; i < priv->child_count; i++) { + if ((accused[i] < quorum_count) && locked_on[i]) { + sources[i] = 1; } + } + return; } -/** - * mark_sources: Mark all 'source' nodes and return number of source - * nodes found + +/* + * This function determines if a self-heal is required for a given inode, + * and if needed, in what direction. * - * A node (a row in the pending matrix) belongs to one of - * three categories: + * locked_on[] is the array representing servers which have been locked and + * from which xattrs have been fetched for analysis. * - * M is the pending matrix. + * The output of the function is by filling the arrays sources[] and sinks[]. * - * 'innocent' - M[i] is all zeroes - * 'fool' - M[i] has i'th element = 1 (self-reference) - * 'wise' - M[i] has i'th element = 0, others are 1 or 0. + * sources[i] is set if i'th server is an eligible source for a selfheal. * - * All 'innocent' nodes are sinks. If all nodes are innocent, no self-heal is - * needed. + * sinks[i] is set if i'th server needs to be healed. * - * A 'wise' node can be a source. If two 'wise' nodes conflict, it is - * a split-brain. If one wise node refers to the other but the other doesn't - * refer back, the referrer is a source. + * if sources[0..N] are all set, there is no need for a selfheal. + * + * if sinks[0..N] are all set, the inode is in split brain. * - * All fools are sinks, unless there are no 'wise' nodes. In that case, - * one of the fools is made a source. */ int -afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix, - struct iatt *bufs, afr_self_heal_type type, - int32_t *success_children, int32_t *subvol_status) -{ - /* stores the 'characters' (innocent, fool, wise) of the nodes */ - afr_node_character *characters = NULL; - int nsources = -1; - unsigned int child_count = 0; - afr_private_t *priv = NULL; - - priv = this->private; - child_count = priv->child_count; - characters = GF_CALLOC (sizeof (afr_node_character), - child_count, gf_afr_mt_afr_node_character); - if (!characters) - goto out; - - this = THIS; - - /* start clean */ - memset (sources, 0, sizeof (*sources) * child_count); - nsources = 0; - afr_find_character_types (characters, pending_matrix, success_children, - child_count); - if (afr_sh_all_nodes_innocent (characters, child_count)) { - switch (type) { - case AFR_SELF_HEAL_METADATA: - nsources = afr_sh_mark_lowest_uid_as_source (bufs, - success_children, - child_count, - sources); - break; - case AFR_SELF_HEAL_DATA: - nsources = afr_sh_mark_zero_size_file_as_sink (bufs, - success_children, - child_count, - sources); - if ((nsources < 0) && subvol_status) - *subvol_status |= SPLIT_BRAIN; - break; - default: - break; - } - goto out; +afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, + afr_transaction_type type, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks, + uint64_t *witness, unsigned char *pflag) +{ + afr_private_t *priv = NULL; + int i = 0; + int j = 0; + int *dirty = NULL; /* Denotes if dirty xattr is set */ + int **matrix = NULL; /* Changelog matrix */ + char *accused = NULL; /* Accused others without any self-accusal */ + char *pending = NULL; /* Have pending operations on others */ + char *self_accused = NULL; /* Accused itself */ + + priv = this->private; + + dirty = alloca0(priv->child_count * sizeof(int)); + accused = alloca0(priv->child_count); + pending = alloca0(priv->child_count); + self_accused = alloca0(priv->child_count); + matrix = ALLOC_MATRIX(priv->child_count, int); + memset(witness, 0, sizeof(*witness) * priv->child_count); + + /* First construct the pending matrix for further analysis */ + afr_selfheal_extract_xattr(this, replies, type, dirty, matrix); + + if (pflag) { + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) + if (matrix[i][j]) + *pflag |= PFLAG_PENDING; + if (*pflag) + break; } - - if (afr_sh_wise_nodes_exist (characters, child_count)) { - afr_sh_compute_wisdom (pending_matrix, characters, child_count); - - if (afr_sh_wise_nodes_conflict (characters, child_count)) { - if (subvol_status) - *subvol_status |= SPLIT_BRAIN; - nsources = -1; - } else { - nsources = afr_sh_mark_wisest_as_sources (sources, - characters, - child_count); - } - } else { - if (subvol_status) - *subvol_status |= ALL_FOOLS; - nsources = afr_mark_biggest_of_fools_as_source (sources, - pending_matrix, - characters, - success_children, - child_count, bufs); + } + + if (afr_success_count(replies, priv->child_count) < priv->child_count) { + /* Treat this just like locks not being acquired */ + return -ENOTCONN; + } + + /* short list all self-accused */ + for (i = 0; i < priv->child_count; i++) { + if (matrix[i][i]) + self_accused[i] = 1; + } + + /* Next short list all accused to exclude them from being sources */ + /* Self-accused can't accuse others as they are FOOLs */ + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + if (matrix[i][j]) { + if (!self_accused[i]) + accused[j] += 1; + if (i != j) + pending[i] += 1; + } + } + } + + /* Short list all non-accused as sources */ + for (i = 0; i < priv->child_count; i++) { + if (!accused[i] && locked_on[i]) + sources[i] = 1; + else + sources[i] = 0; + } + + /* Everyone accused by non-self-accused sources are sinks */ + memset(sinks, 0, priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (self_accused[i]) + continue; + for (j = 0; j < priv->child_count; j++) { + if (matrix[i][j]) + sinks[j] = 1; + } + } + + /* For breaking ties provide with number of fops they witnessed */ + + /* + * count the pending fops witnessed from itself to others when it is + * self-accused + */ + for (i = 0; i < priv->child_count; i++) { + if (!self_accused[i]) + continue; + for (j = 0; j < priv->child_count; j++) { + if (i == j) + continue; + witness[i] += matrix[i][j]; + } + } + + if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION) + afr_selfheal_post_op_failure_accounting(priv, accused, sources, + locked_on); + + /* If no sources, all locked nodes are sinks - split brain */ + if (AFR_COUNT(sources, priv->child_count) == 0) { + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i]) + sinks[i] = 1; + } + if (pflag) + *pflag |= PFLAG_SBRAIN; + } + + /* One more class of witness similar to dirty in v2 is where no pending + * exists but we have self-accusing markers. This can happen in afr-v1 + * if the brick crashes just after doing xattrop on self but + * before xattrop on the other xattrs on the brick in pre-op. */ + if (AFR_COUNT(pending, priv->child_count) == 0) { + for (i = 0; i < priv->child_count; i++) { + if (self_accused[i]) + witness[i] += matrix[i][i]; } + } else { + /* In afr-v1 if a file is self-accused and has pending + * operations on others then it is similar to 'dirty' in afr-v2. + * Consider such cases as witness. + */ + for (i = 0; i < priv->child_count; i++) { + if (self_accused[i] && pending[i]) + witness[i] += matrix[i][i]; + } + } -out: - if (nsources == 0) - afr_mark_success_children_sources (sources, success_children, - child_count); - GF_FREE (characters); + /* count the number of dirty fops witnessed */ + for (i = 0; i < priv->child_count; i++) + witness[i] += dirty[i]; - gf_log (this->name, GF_LOG_DEBUG, "Number of sources: %d", nsources); - return nsources; + return 0; } void -afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, - int32_t *delta_matrix[], unsigned char success[], - int child_count, afr_transaction_type type) -{ - int tgt = 0; - int src = 0; - int value = 0; - - afr_build_pending_matrix (priv->pending_key, delta_matrix, NULL, - xattr, type, priv->child_count); - - /* - * The algorithm here has two parts. First, for each subvol indexed - * as tgt, we try to figure out what count everyone should have for it. - * If the self-heal succeeded, that's easy; the value is zero. - * Otherwise, the value is the maximum of the succeeding nodes' counts. - * Once we know the value, we loop through (possibly for a second time) - * setting each count to the difference so that when we're done all - * succeeding nodes will have the same count for tgt. - */ - for (tgt = 0; tgt < priv->child_count; ++tgt) { - value = 0; - if (!success[tgt]) { - /* Find the maximum. */ - for (src = 0; src < priv->child_count; ++src) { - if (!success[src]) { - continue; - } - if (delta_matrix[src][tgt] > value) { - value = delta_matrix[src][tgt]; - } - } - } - /* Force everyone who succeeded to the chosen value. */ - for (src = 0; src < priv->child_count; ++src) { - if (success[src]) { - delta_matrix[src][tgt] = value - - delta_matrix[src][tgt]; - } - else { - delta_matrix[src][tgt] = 0; - } - } - } +afr_log_selfheal(uuid_t gfid, xlator_t *this, int ret, char *type, int source, + unsigned char *sources, unsigned char *healed_sinks) +{ + char *status = NULL; + char *sinks_str = NULL; + char *p = NULL; + char *sources_str = NULL; + char *q = NULL; + afr_private_t *priv = NULL; + gf_loglevel_t loglevel = GF_LOG_NONE; + int i = 0; + + priv = this->private; + sinks_str = alloca0(priv->child_count * 8); + p = sinks_str; + sources_str = alloca0(priv->child_count * 8); + q = sources_str; + for (i = 0; i < priv->child_count; i++) { + if (healed_sinks[i]) + p += sprintf(p, "%d ", i); + if (sources[i]) { + if (source == i) { + q += sprintf(q, "[%d] ", i); + } else { + q += sprintf(q, "%d ", i); + } + } + } + + if (ret < 0) { + status = "Failed"; + loglevel = GF_LOG_DEBUG; + } else { + status = "Completed"; + loglevel = GF_LOG_INFO; + } + + gf_msg(this->name, loglevel, 0, AFR_MSG_SELF_HEAL_INFO, + "%s %s selfheal on %s. " + "sources=%s sinks=%s", + status, type, uuid_utoa(gfid), sources_str, sinks_str); } - int -afr_sh_delta_to_xattr (xlator_t *this, - int32_t *delta_matrix[], dict_t *xattr[], - int child_count, afr_transaction_type type) -{ - int i = 0; - int j = 0; - int k = 0; - int ret = 0; - int32_t *pending = NULL; - int32_t *local_pending = NULL; - afr_private_t *priv = NULL; - - priv = this->private; - for (i = 0; i < child_count; i++) { - if (!xattr[i]) - continue; - - local_pending = NULL; - for (j = 0; j < child_count; j++) { - pending = GF_CALLOC (sizeof (int32_t), 3, - gf_afr_mt_int32_t); - - if (!pending) { - gf_log (this->name, GF_LOG_ERROR, - "failed to allocate pending entry " - "for %s[%d] on %s", - priv->pending_key[j], type, - priv->children[i]->name); - continue; - } - /* 3 = data+metadata+entry */ - - k = afr_index_for_transaction_type (type); - - pending[k] = hton32 (delta_matrix[i][j]); - - if (j == i) { - local_pending = pending; - continue; - } - ret = dict_set_bin (xattr[i], priv->pending_key[j], - pending, - AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value."); - GF_FREE (pending); - } - } - if (local_pending) { - ret = dict_set_bin (xattr[i], priv->pending_key[i], - local_pending, - AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value."); - GF_FREE (local_pending); - } - } - } - return 0; -} +afr_selfheal_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *parbuf) +{ + afr_local_t *local = NULL; + int i = -1; + GF_UNUSED int ret = -1; + int8_t need_heal = 1; + local = frame->local; + i = (long)cookie; -int -afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (buf) + local->replies[i].poststat = *buf; + if (parbuf) + local->replies[i].postparent = *parbuf; + if (xdata) { + local->replies[i].xdata = dict_ref(xdata); + ret = dict_get_int8(xdata, "link-count", &need_heal); + } - local = frame->local; - sh = &local->self_heal; + local->replies[i].need_heal = need_heal; + syncbarrier_wake(&local->barrier); - afr_sh_reset (frame, this); + return 0; +} - if (local->unhealable) { - gf_log (this->name, GF_LOG_DEBUG, - "split brain found, aborting selfheal of %s", - local->loc.path); - } +inode_t * +afr_selfheal_unlocked_lookup_on(call_frame_t *frame, inode_t *parent, + const char *name, struct afr_reply *replies, + unsigned char *lookup_on, dict_t *xattr) +{ + loc_t loc = { + 0, + }; + dict_t *xattr_req = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + inode_t *inode = NULL; - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - sh->completion_cbk (frame, this); - } else { - gf_log (this->name, GF_LOG_TRACE, - "proceeding to metadata check on %s", - local->loc.path); - afr_self_heal_metadata (frame, this); - } + local = frame->local; + priv = frame->this->private; - return 0; -} + xattr_req = dict_new(); + if (!xattr_req) + return NULL; + if (xattr) + dict_copy(xattr, xattr_req); -static int -afr_sh_missing_entries_finish (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; + if (afr_xattr_req_prepare(frame->this, xattr_req) != 0) { + dict_unref(xattr_req); + return NULL; + } - local = frame->local; - int_lock = &local->internal_lock; + inode = inode_new(parent->table); + if (!inode) { + dict_unref(xattr_req); + return NULL; + } - int_lock->lock_cbk = afr_sh_missing_entries_done; - afr_unlock (frame, this); + loc.parent = inode_ref(parent); + gf_uuid_copy(loc.pargfid, parent->gfid); + loc.name = name; + loc.inode = inode_ref(inode); - return 0; -} + AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xattr_req); -int -afr_sh_common_create (afr_self_heal_t *sh, unsigned int child_count) -{ - int ret = -ENOMEM; - sh->buf = GF_CALLOC (child_count, sizeof (*sh->buf), - gf_afr_mt_iatt); - if (!sh->buf) - goto out; - sh->parentbufs = GF_CALLOC (child_count, sizeof (*sh->parentbufs), - gf_afr_mt_iatt); - if (!sh->parentbufs) - goto out; - sh->child_errno = GF_CALLOC (child_count, sizeof (*sh->child_errno), - gf_afr_mt_int); - if (!sh->child_errno) - goto out; - sh->success_children = afr_children_create (child_count); - if (!sh->success_children) - goto out; - sh->fresh_children = afr_children_create (child_count); - if (!sh->fresh_children) - goto out; - sh->xattr = GF_CALLOC (child_count, sizeof (*sh->xattr), - gf_afr_mt_dict_t); - if (!sh->xattr) - goto out; - ret = 0; -out: - return ret; -} + afr_replies_copy(replies, local->replies, priv->child_count); -void -afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, - dict_t *xattr, struct iatt *postparent, - loc_t *loc) -{ - int child_index = 0; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - priv = this->private; - sh = &local->self_heal; - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == 0) { - sh->buf[child_index] = *buf; - sh->parentbufs[child_index] = *postparent; - sh->success_children[sh->success_count] = child_index; - sh->success_count++; - sh->xattr[child_index] = dict_ref (xattr); - } else { - gf_log (this->name, GF_LOG_DEBUG, "path %s on subvolume" - " %s => -1 (%s)", loc->path, - priv->children[child_index]->name, - strerror (op_errno)); - local->self_heal.child_errno[child_index] = op_errno; - } - } - UNLOCK (&frame->lock); - return; -} + loc_wipe(&loc); + dict_unref(xattr_req); -gf_boolean_t -afr_valid_ia_type (ia_type_t ia_type) -{ - switch (ia_type) { - case IA_IFSOCK: - case IA_IFREG: - case IA_IFBLK: - case IA_IFCHR: - case IA_IFIFO: - case IA_IFLNK: - case IA_IFDIR: - return _gf_true; - default: - return _gf_false; - } - return _gf_false; + return inode; } -int -afr_impunge_frame_create (call_frame_t *frame, xlator_t *this, - int active_source, call_frame_t **impunge_frame) +static int +afr_set_multi_dom_lock_count_request(xlator_t *this, dict_t *dict) { - afr_local_t *local = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int32_t op_errno = 0; - afr_private_t *priv = NULL; - int ret = 0; - call_frame_t *new_frame = NULL; + int ret = 0; + afr_private_t *priv = NULL; + char *key1 = NULL; + char *key2 = NULL; - op_errno = ENOMEM; - priv = this->private; - new_frame = copy_frame (frame); - if (!new_frame) { - goto out; - } + priv = this->private; + key1 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 + + strlen(this->name)); + key2 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 + + strlen(priv->sh_domain)); - AFR_LOCAL_ALLOC_OR_GOTO (impunge_local, out); - - local = frame->local; - new_frame->local = impunge_local; - impunge_sh = &impunge_local->self_heal; - impunge_sh->sh_frame = frame; - impunge_sh->active_source = active_source; - impunge_local->child_up = memdup (local->child_up, - sizeof (*local->child_up) * - priv->child_count); - if (!impunge_local->child_up) - goto out; + ret = dict_set_uint32(dict, GLUSTERFS_MULTIPLE_DOM_LK_CNT_REQUESTS, 1); + if (ret) + return ret; - impunge_local->pending = afr_matrix_create (priv->child_count, - AFR_NUM_CHANGE_LOGS); - if (!impunge_local->pending) - goto out; + sprintf(key1, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, this->name); + ret = dict_set_uint32(dict, key1, 1); + if (ret) + return ret; - ret = afr_sh_common_create (impunge_sh, priv->child_count); - if (ret) { - op_errno = -ret; - goto out; - } - op_errno = 0; - *impunge_frame = new_frame; -out: - if (op_errno && new_frame) - AFR_STACK_DESTROY (new_frame); - return -op_errno; -} + sprintf(key2, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, priv->sh_domain); + ret = dict_set_uint32(dict, key2, 1); + if (ret) + return ret; -void -afr_sh_missing_entry_call_impunge_recreate (call_frame_t *frame, xlator_t *this, - struct iatt *buf, - struct iatt *postparent, - afr_impunge_done_cbk_t impunge_done) -{ - call_frame_t *impunge_frame = NULL; - afr_local_t *local = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *impunge_sh = NULL; - int ret = 0; - unsigned int enoent_count = 0; - afr_private_t *priv = NULL; - int i = 0; - int32_t op_errno = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - enoent_count = afr_errno_count (NULL, sh->child_errno, - priv->child_count, ENOENT); - if (!enoent_count) { - gf_log (this->name, GF_LOG_INFO, - "no missing files - %s. proceeding to metadata check", - local->loc.path); - goto out; - } - sh->impunge_done = impunge_done; - ret = afr_impunge_frame_create (frame, this, sh->source, &impunge_frame); - if (ret) - goto out; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - loc_copy (&impunge_local->loc, &local->loc); - ret = afr_build_parent_loc (&impunge_sh->parent_loc, - &impunge_local->loc, &op_errno); - if (ret) { - ret = -op_errno; - goto out; - } - impunge_local->call_count = enoent_count; - impunge_sh->entrybuf = sh->buf[sh->source]; - impunge_sh->parentbuf = sh->parentbufs[sh->source]; - for (i = 0; i < priv->child_count; i++) { - if (!impunge_local->child_up[i]) { - impunge_sh->child_errno[i] = ENOTCONN; - continue; - } - if (sh->child_errno[i] != ENOENT) { - impunge_sh->child_errno[i] = EEXIST; - continue; - } - } - for (i = 0; i < priv->child_count; i++) { - if (sh->child_errno[i] != ENOENT) - continue; - afr_sh_entry_impunge_create (impunge_frame, this, i); - enoent_count--; - } - GF_ASSERT (!enoent_count); - return; -out: - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "impunge of %s failed, " - "reason: %s", local->loc.path, strerror (-ret)); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } - afr_sh_missing_entries_finish (frame, this); + return 0; } int -afr_sh_create_entry_cbk (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; +afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode, + uuid_t gfid, struct afr_reply *replies, + unsigned char *discover_on, dict_t *dict) +{ + loc_t loc = { + 0, + }; + dict_t *xattr_req = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = frame->this->private; + + xattr_req = dict_new(); + if (!xattr_req) + return -ENOMEM; + if (dict) + dict_copy(dict, xattr_req); + + if (afr_xattr_req_prepare(frame->this, xattr_req) != 0) { + dict_unref(xattr_req); + return -ENOMEM; + } + + if (afr_set_multi_dom_lock_count_request(frame->this, xattr_req)) { + dict_unref(xattr_req); + return -1; + } - local = frame->local; - sh = &local->self_heal; - if (op_ret < 0) - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_missing_entries_finish (frame, this); - return 0; -} + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, gfid); -static int -sh_missing_entries_create (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int type = 0; - struct iatt *buf = NULL; - struct iatt *postparent = NULL; - - local = frame->local; - sh = &local->self_heal; - - buf = &sh->buf[sh->source]; - postparent = &sh->parentbufs[sh->source]; - - type = buf->ia_type; - if (!afr_valid_ia_type (type)) { - gf_log (this->name, GF_LOG_ERROR, - "%s: unknown file type: 0%o", local->loc.path, type); - afr_set_local_for_unhealable (local); - afr_sh_missing_entries_finish (frame, this); - goto out; - } + AFR_ONLIST(discover_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xattr_req); - afr_sh_missing_entry_call_impunge_recreate (frame, this, - buf, postparent, - afr_sh_create_entry_cbk); -out: - return 0; -} + afr_replies_copy(replies, local->replies, priv->child_count); -void -afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - ia_type_t ia_type = IA_INVAL; - int32_t nsources = 0; - loc_t *loc = NULL; - int32_t subvol_status = 0; - afr_transaction_type txn_type = AFR_DATA_TRANSACTION; - gf_boolean_t split_brain = _gf_false; - int read_child = -1; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - loc = &local->loc; - - if (op_ret < 0) { - if (op_errno == EIO) { - afr_set_local_for_unhealable (local); - } - // EIO can happen if finding the fresh parent dir failed - goto out; - } + loc_wipe(&loc); + dict_unref(xattr_req); - //now No chance for the ia_type to conflict - ia_type = sh->buf[sh->success_children[0]].ia_type; - txn_type = afr_transaction_type_get (ia_type); - nsources = afr_build_sources (this, sh->xattr, sh->buf, - sh->pending_matrix, sh->sources, - sh->success_children, txn_type, - &subvol_status, _gf_false); - if (nsources < 0) { - gf_log (this->name, GF_LOG_INFO, "No sources for dir of %s," - " in missing entry self-heal, continuing with the rest" - " of the self-heals", local->loc.path); - if (subvol_status & SPLIT_BRAIN) { - split_brain = _gf_true; - switch (txn_type) { - case AFR_DATA_TRANSACTION: - nsources = 1; - sh->sources[sh->success_children[0]] = 1; - break; - case AFR_ENTRY_TRANSACTION: - read_child = afr_get_no_xattr_dir_read_child - (this, - sh->success_children, - sh->buf); - sh->sources[read_child] = 1; - nsources = 1; - break; - default: - op_errno = EIO; - goto out; - } - } else { - op_errno = EIO; - goto out; - } - } + return 0; +} - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_children, priv->child_count); - sh->source = sh->fresh_children[0]; - if (sh->source == -1) { - gf_log (this->name, GF_LOG_DEBUG, "No active sources found."); - op_errno = EIO; - goto out; - } +int +afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid, + struct afr_reply *replies) +{ + afr_local_t *local = NULL; + dict_t *dict = NULL; - if (sh->gfid_sh_success_cbk) - sh->gfid_sh_success_cbk (frame, this); - sh->type = sh->buf[sh->source].ia_type; - if (uuid_is_null (loc->inode->gfid)) - uuid_copy (loc->gfid, sh->buf[sh->source].ia_gfid); - if (split_brain) { - afr_sh_missing_entries_finish (frame, this); - } else { - sh_missing_entries_create (frame, this); - } - return; -out: - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_set_error (sh, op_errno); - afr_sh_missing_entries_finish (frame, this); - return; -} + local = frame->local; -static int -afr_sh_common_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xattr, - struct iatt *postparent) -{ - int call_count = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret, - op_errno, inode, buf, xattr, - postparent, &sh->lookup_loc); - call_count = afr_frame_return (frame); - - if (call_count) - goto out; - op_ret = -1; - if (!sh->success_count) { - op_errno = afr_resultant_errno_get (NULL, sh->child_errno, - priv->child_count); - gf_log (this->name, GF_LOG_ERROR, "Failed to lookup %s, " - "reason %s", sh->lookup_loc.path, - strerror (op_errno)); - goto done; - } + if (local->xattr_req) + dict = local->xattr_req; - if ((sh->lookup_flags & AFR_LOOKUP_FAIL_CONFLICTS) && - (afr_conflicting_iattrs (sh->buf, sh->success_children, - priv->child_count, - sh->lookup_loc.path, this->name))) { - op_errno = EIO; - gf_log (this->name, GF_LOG_ERROR, "Conflicting entries " - "for %s", sh->lookup_loc.path); - goto done; - } + return afr_selfheal_unlocked_discover_on(frame, inode, gfid, replies, + local->child_up, dict); +} - if ((sh->lookup_flags & AFR_LOOKUP_FAIL_MISSING_GFIDS) && - (afr_gfid_missing_count (this->name, sh->success_children, - sh->buf, priv->child_count, - sh->lookup_loc.path))) { - op_errno = ENODATA; - gf_log (this->name, GF_LOG_ERROR, "Missing Gfids " - "for %s", sh->lookup_loc.path); - goto done; - } - op_ret = 0; +unsigned int +afr_success_count(struct afr_reply *replies, unsigned int count) +{ + int i = 0; + unsigned int success = 0; -done: - sh->lookup_done (frame, this, op_ret, op_errno); -out: - return 0; + for (i = 0; i < count; i++) + if (replies[i].valid && replies[i].op_ret == 0) + success++; + return success; } int -afr_sh_remove_entry_cbk (call_frame_t *frame, xlator_t *this, int child, - int32_t op_ret, int32_t op_errno) -{ - int call_count = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - GF_ASSERT (sh->post_remove_call); - if ((op_ret == -1) && (op_errno != ENOENT)) { - gf_log (this->name, GF_LOG_ERROR, - "purge entry %s failed, on child %d reason, %s", - local->loc.path, child, strerror (op_errno)); - LOCK (&frame->lock); - { - afr_sh_set_error (sh, EIO); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } - UNLOCK (&frame->lock); - } - call_count = afr_frame_return (frame); - if (call_count == 0) - sh->post_remove_call (frame, this); - return 0; -} +afr_selfheal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + afr_local_t *local = NULL; + int i = 0; -void -afr_sh_call_entry_expunge_remove (call_frame_t *frame, xlator_t *this, - int child_index, struct iatt *buf, - struct iatt *parentbuf, - afr_expunge_done_cbk_t expunge_done) -{ - call_frame_t *expunge_frame = NULL; - afr_local_t *local = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *expunge_sh = NULL; - int32_t op_errno = 0; - int ret = 0; - - expunge_frame = copy_frame (frame); - if (!expunge_frame) { - goto out; - } + local = frame->local; + i = (long)cookie; - AFR_LOCAL_ALLOC_OR_GOTO (expunge_local, out); + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; - local = frame->local; - sh = &local->self_heal; - expunge_frame->local = expunge_local; - expunge_sh = &expunge_local->self_heal; - expunge_sh->sh_frame = frame; - loc_copy (&expunge_local->loc, &local->loc); - ret = afr_build_parent_loc (&expunge_sh->parent_loc, - &expunge_local->loc, &op_errno); - if (ret) { - ret = -op_errno; - goto out; - } - sh->expunge_done = expunge_done; - afr_sh_entry_expunge_remove (expunge_frame, this, child_index, buf, - parentbuf); - return; -out: - gf_log (this->name, GF_LOG_ERROR, "Expunge of %s failed, reason: %s", - local->loc.path, strerror (op_errno)); - expunge_done (frame, this, child_index, -1, op_errno); -} + syncbarrier_wake(&local->barrier); -void -afr_sh_remove_stale_lookup_info (afr_self_heal_t *sh, int32_t *success_children, - int32_t *fresh_children, - unsigned int child_count) -{ - int i = 0; - - for (i = 0; i < child_count; i++) { - if (afr_is_child_present (success_children, child_count, i) && - !afr_is_child_present (fresh_children, child_count, i)) { - sh->child_errno[i] = ENOENT; - GF_ASSERT (sh->xattr[i]); - dict_unref (sh->xattr[i]); - sh->xattr[i] = NULL; - } - } + return 0; } int -afr_sh_purge_stale_entries_done (call_frame_t *frame, xlator_t *this) +afr_locked_fill(call_frame_t *frame, xlator_t *this, unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; + int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int count = 0; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + local = frame->local; + priv = this->private; - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - afr_sh_missing_entries_finish (frame, this); + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && local->replies[i].op_ret == 0) { + locked_on[i] = 1; + count++; } else { - if (afr_gfid_missing_count (this->name, sh->fresh_children, - sh->buf, priv->child_count, - local->loc.path)) { - afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_missing_entries_lookup_done, - sh->sh_gfid_req, - AFR_LOOKUP_FAIL_CONFLICTS| - AFR_LOOKUP_FAIL_MISSING_GFIDS, - NULL); - } else { - //No need to set gfid so goto missing entries lookup done - //Behave as if you have done the lookup - afr_sh_remove_stale_lookup_info (sh, - sh->success_children, - sh->fresh_children, - priv->child_count); - afr_children_copy (sh->success_children, - sh->fresh_children, - priv->child_count); - afr_sh_missing_entries_lookup_done (frame, this, 0, 0); - } + locked_on[i] = 0; } - return 0; + } + + return count; } -gf_boolean_t -afr_sh_purge_entry_condition (afr_local_t *local, afr_private_t *priv, - int child) +int +afr_selfheal_tryinodelk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on) { - afr_self_heal_t *sh = NULL; + loc_t loc = { + 0, + }; + struct gf_flock flock = { + 0, + }; - sh = &local->self_heal; + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - if (local->child_up[child] && - (!afr_is_child_present (sh->fresh_parent_dirs, priv->child_count, - child)) - && (sh->child_errno[child] != ENOENT)) - return _gf_true; + flock.l_type = F_WRLCK; + flock.l_start = off; + flock.l_len = size; - return _gf_false; + AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock, + NULL); + + loc_wipe(&loc); + + return afr_locked_fill(frame, this, locked_on); } -gf_boolean_t -afr_sh_purge_stale_entry_condition (afr_local_t *local, afr_private_t *priv, - int child) +int +afr_selfheal_inodelk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on) { - afr_self_heal_t *sh = NULL; + loc_t loc = { + 0, + }; + struct gf_flock flock = { + 0, + }; + afr_local_t *local = NULL; + int i = 0; + afr_private_t *priv = NULL; - sh = &local->self_heal; + priv = this->private; + local = frame->local; - if (local->child_up[child] && - (!afr_is_child_present (sh->fresh_children, priv->child_count, - child)) - && (sh->child_errno[child] != ENOENT)) - return _gf_true; + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - return _gf_false; -} + flock.l_type = F_WRLCK; + flock.l_start = off; + flock.l_len = size; -void -afr_sh_purge_entry_common (call_frame_t *frame, xlator_t *this, - gf_boolean_t purge_condition (afr_local_t *local, - afr_private_t *priv, - int child)) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - int i = 0; - int call_count = 0; + AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock, + NULL); - local = frame->local; - sh = &local->self_heal; - priv = this->private; + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == -1 && + local->replies[i].op_errno == EAGAIN) { + afr_locked_fill(frame, this, locked_on); + afr_selfheal_uninodelk(frame, this, inode, dom, off, size, + locked_on); - for (i = 0; i < priv->child_count; i++) { - if (purge_condition (local, priv, i)) - call_count++; + AFR_SEQ(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLKW, + &flock, NULL); + break; } + } - if (call_count == 0) { - sh->post_remove_call (frame, this); - goto out; - } + loc_wipe(&loc); - local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (!purge_condition (local, priv, i)) - continue; - gf_log (this->name, GF_LOG_INFO, "purging the stale entry %s " - "on %s", local->loc.path, priv->children[i]->name); - afr_sh_call_entry_expunge_remove (frame, this, - (long) i, &sh->buf[i], - &sh->parentbufs[i], - afr_sh_remove_entry_cbk); - } -out: - return; + return afr_locked_fill(frame, this, locked_on); } -void -afr_sh_purge_entry (call_frame_t *frame, xlator_t *this) +static void +afr_get_lock_and_eagain_counts(afr_private_t *priv, struct afr_reply *replies, + int *lock_count, int *eagain_count) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - sh->post_remove_call = afr_sh_missing_entries_finish; + int i = 0; - afr_sh_purge_entry_common (frame, this, afr_sh_purge_entry_condition); + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + if (replies[i].op_ret == 0) { + (*lock_count)++; + } else if (replies[i].op_ret == -1 && replies[i].op_errno == EAGAIN) { + (*eagain_count)++; + } + } } -void -afr_sh_purge_stale_entry (call_frame_t *frame, xlator_t *this) +/*Do blocking locks if number of locks acquired is majority and there were some + * EAGAINs. Useful for odd-way replication*/ +int +afr_selfheal_tie_breaker_inodelk(call_frame_t *frame, xlator_t *this, + inode_t *inode, char *dom, off_t off, + size_t size, unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; + loc_t loc = { + 0, + }; + struct gf_flock flock = { + 0, + }; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int lock_count = 0; + int eagain_count = 0; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + priv = this->private; + local = frame->local; - sh->post_remove_call = afr_sh_purge_stale_entries_done; + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - for (i = 0; i < priv->child_count; i++) { - if (afr_is_child_present (sh->fresh_children, - priv->child_count, i)) - continue; + flock.l_type = F_WRLCK; + flock.l_start = off; + flock.l_len = size; - if ((!local->child_up[i]) || sh->child_errno[i] != 0) - continue; + AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock, + NULL); - GF_ASSERT (!uuid_is_null (sh->entrybuf.ia_gfid) || - uuid_is_null (sh->buf[i].ia_gfid)); + afr_get_lock_and_eagain_counts(priv, local->replies, &lock_count, + &eagain_count); - if ((sh->entrybuf.ia_type != sh->buf[i].ia_type) || - (uuid_compare (sh->buf[i].ia_gfid, - sh->entrybuf.ia_gfid))) - continue; + if (lock_count > priv->child_count / 2 && eagain_count) { + afr_locked_fill(frame, this, locked_on); + afr_selfheal_uninodelk(frame, this, inode, dom, off, size, locked_on); - afr_children_add_child (sh->fresh_children, i, - priv->child_count); + AFR_SEQ(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLKW, + &flock, NULL); + } - } - afr_sh_purge_entry_common (frame, this, - afr_sh_purge_stale_entry_condition); -} + loc_wipe(&loc); -void -afr_sh_save_child_iatts_from_policy (int32_t *children, struct iatt *bufs, - struct iatt *save, - unsigned int child_count) -{ - int i = 0; - int child = 0; - gf_boolean_t saved = _gf_false; - - GF_ASSERT (save); - //if iatt buf with gfid exists sets it - for (i = 0; i < child_count; i++) { - child = children[i]; - if (child == -1) - break; - *save = bufs[child]; - saved = _gf_true; - if (!uuid_is_null (save->ia_gfid)) - break; - } - GF_ASSERT (saved); + return afr_locked_fill(frame, this, locked_on); } -void -afr_get_children_of_fresh_parent_dirs (afr_self_heal_t *sh, - unsigned int child_count) +int +afr_selfheal_uninodelk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + const unsigned char *locked_on) { - afr_children_intersection_get (sh->success_children, - sh->fresh_parent_dirs, - sh->sources, child_count); - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_children, child_count); - memset (sh->sources, 0, sizeof (*sh->sources) * child_count); -} + loc_t loc = { + 0, + }; + struct gf_flock flock = { + 0, + }; -void -afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int32_t fresh_child_enoents = 0; - int32_t fresh_parent_count = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (op_ret < 0) - goto fail; - afr_get_children_of_fresh_parent_dirs (sh, priv->child_count); - fresh_parent_count = afr_get_children_count (sh->fresh_parent_dirs, - priv->child_count); - //we need the enoent count of the subvols present in fresh_parent_dirs - fresh_child_enoents = afr_errno_count (sh->fresh_parent_dirs, - sh->child_errno, - priv->child_count, ENOENT); - if (fresh_child_enoents == fresh_parent_count) { - afr_sh_set_error (sh, ENOENT); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_purge_entry (frame, this); - } else if (!afr_conflicting_iattrs (sh->buf, sh->fresh_children, - priv->child_count, local->loc.path, - this->name)) { - afr_sh_save_child_iatts_from_policy (sh->fresh_children, - sh->buf, &sh->entrybuf, - priv->child_count); - afr_update_gfid_from_iatts (sh->sh_gfid_req, sh->buf, - sh->fresh_children, - priv->child_count); - afr_sh_purge_stale_entry (frame, this); - } else { - op_errno = EIO; - afr_set_local_for_unhealable (local); - goto fail; - } + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - return; + flock.l_type = F_UNLCK; + flock.l_start = off; + flock.l_len = size; -fail: - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_set_error (sh, op_errno); - afr_sh_missing_entries_finish (frame, this); - return; -} + AFR_ONLIST(locked_on, frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, + F_SETLK, &flock, NULL); -static void -afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int enoent_count = 0; - int nsources = 0; - int source = -1; - int32_t subvol_status = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (op_ret < 0) - goto out; - enoent_count = afr_errno_count (NULL, sh->child_errno, - priv->child_count, ENOENT); - if (enoent_count > 0) { - gf_log (this->name, GF_LOG_INFO, "Parent dir missing for %s," - " in missing entry self-heal, aborting missing-entry " - "self-heal", - local->loc.path); - afr_sh_missing_entries_finish (frame, this); - return; - } - - nsources = afr_build_sources (this, sh->xattr, sh->buf, - sh->pending_matrix, sh->sources, - sh->success_children, - AFR_ENTRY_TRANSACTION, &subvol_status, - _gf_true); - if ((subvol_status & ALL_FOOLS) || - (subvol_status & SPLIT_BRAIN)) { - gf_log (this->name, GF_LOG_INFO, "%s: Performing conservative " - "merge", sh->parent_loc.path); - afr_mark_success_children_sources (sh->sources, - sh->success_children, - priv->child_count); - } else if (nsources < 0) { - gf_log (this->name, GF_LOG_ERROR, "No sources for dir " - "of %s, in missing entry self-heal, aborting " - "self-heal", local->loc.path); - op_errno = EIO; - goto out; - } - - source = afr_sh_select_source (sh->sources, priv->child_count); - if (source == -1) { - GF_ASSERT (0); - gf_log (this->name, GF_LOG_DEBUG, "No active sources found."); - op_errno = EIO; - goto out; - } - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_parent_dirs, priv->child_count); - afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_children_lookup_done, NULL, 0, - NULL); - return; + loc_wipe(&loc); -out: - afr_sh_set_error (sh, op_errno); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_missing_entries_finish (frame, this); - return; + return 0; } -void -afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count) +int +afr_selfheal_tryentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on) { - int i = 0; + loc_t loc = { + 0, + }; - for (i = 0; i < child_count; i++) { - memset (&sh->buf[i], 0, sizeof (sh->buf[i])); - memset (&sh->parentbufs[i], 0, sizeof (sh->parentbufs[i])); - sh->child_errno[i] = 0; - } - memset (&sh->parentbuf, 0, sizeof (sh->parentbuf)); - sh->success_count = 0; - afr_reset_children (sh->success_children, child_count); - afr_reset_children (sh->fresh_children, child_count); - afr_reset_xattr (sh->xattr, child_count); - loc_wipe (&sh->lookup_loc); -} - -/* afr self-heal state will be lost if this call is made - * please check the afr_sh_common_reset that is called in this function - */ -int -afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, - afr_lookup_done_cbk_t lookup_done , uuid_t gfid, - int32_t flags, dict_t *xdata) -{ - afr_local_t *local = NULL; - int i = 0; - int call_count = 0; - afr_private_t *priv = NULL; - dict_t *xattr_req = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - priv = this->private; - sh = &local->self_heal; - - call_count = afr_up_children_count (local->child_up, priv->child_count); - - local->call_count = call_count; - - xattr_req = dict_new(); - - if (xattr_req) { - afr_xattr_req_prepare (this, xattr_req, loc->path); - if (gfid) { - gf_log (this->name, GF_LOG_DEBUG, - "looking up %s with gfid: %s", - loc->path, uuid_utoa (gfid)); - GF_ASSERT (!uuid_is_null (gfid)); - afr_set_dict_gfid (xattr_req, gfid); - } - } + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - afr_sh_common_reset (sh, priv->child_count); - sh->lookup_done = lookup_done; - loc_copy (&sh->lookup_loc, loc); - sh->lookup_flags = flags; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - gf_log (this->name, GF_LOG_DEBUG, - "looking up %s on subvolume %s", - loc->path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, - afr_sh_common_lookup_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - loc, xattr_req); - - if (!--call_count) - break; - } - } + AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); - if (xattr_req) - dict_unref (xattr_req); + loc_wipe(&loc); - return 0; + return afr_locked_fill(frame, this, locked_on); } +int +afr_selfheal_entrylk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on) +{ + loc_t loc = { + 0, + }; + afr_local_t *local = NULL; + int i = 0; + afr_private_t *priv = NULL; + priv = this->private; + local = frame->local; -int -afr_sh_post_nb_entrylk_missing_entry_sh_cbk (call_frame_t *frame, - xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; - - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "Non blocking entrylks failed."); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_missing_entries_done (frame, this); - } else { + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - gf_log (this->name, GF_LOG_DEBUG, - "Non blocking entrylks done. Proceeding to FOP"); - afr_sh_common_lookup (frame, this, &sh->parent_loc, - afr_sh_find_fresh_parents, - NULL, AFR_LOOKUP_FAIL_CONFLICTS, - NULL); + AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == -1 && + local->replies[i].op_errno == EAGAIN) { + afr_locked_fill(frame, this, locked_on); + afr_selfheal_unentrylk(frame, this, inode, dom, name, locked_on, + NULL); + + AFR_SEQ(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name, + ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); + break; } + } - return 0; + loc_wipe(&loc); + + return afr_locked_fill(frame, this, locked_on); } int -afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, - char *base_name, afr_lock_cbk_t lock_cbk) +afr_selfheal_tie_breaker_entrylk(call_frame_t *frame, xlator_t *this, + inode_t *inode, char *dom, const char *name, + unsigned char *locked_on) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + loc_t loc = { + 0, + }; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int lock_count = 0; + int eagain_count = 0; - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; + priv = this->private; + local = frame->local; - int_lock->transaction_lk_type = AFR_SELFHEAL_LK; - int_lock->selfheal_lk_type = AFR_ENTRY_SELF_HEAL_LK; + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - afr_set_lock_number (frame, this); + AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); - int_lock->lk_basename = base_name; - int_lock->lk_loc = loc; - int_lock->lock_cbk = lock_cbk; - int_lock->domain = this->name; + afr_get_lock_and_eagain_counts(priv, local->replies, &lock_count, + &eagain_count); - int_lock->lockee_count = 0; - afr_init_entry_lockee (&int_lock->lockee[0], local, loc, - base_name, priv->child_count); - int_lock->lockee_count++; - afr_nonblocking_entrylk (frame, this); + if (lock_count > priv->child_count / 2 && eagain_count) { + afr_locked_fill(frame, this, locked_on); + afr_selfheal_unentrylk(frame, this, inode, dom, name, locked_on, NULL); - return 0; + AFR_SEQ(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name, + ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); + } + + loc_wipe(&loc); + + return afr_locked_fill(frame, this, locked_on); } -static int -afr_self_heal_parent_entrylk (call_frame_t *frame, xlator_t *this, - afr_lock_cbk_t lock_cbk) +int +afr_selfheal_unentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on, + dict_t *xdata) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_internal_lock_t *int_lock = NULL; - int ret = -1; - int32_t op_errno = 0; + loc_t loc = { + 0, + }; - local = frame->local; - sh = &local->self_heal; + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - gf_log (this->name, GF_LOG_TRACE, - "attempting to recreate missing entries for path=%s", - local->loc.path); + AFR_ONLIST(locked_on, frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, + name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata); - ret = afr_build_parent_loc (&sh->parent_loc, &local->loc, &op_errno); - if (ret) - goto out; + loc_wipe(&loc); - afr_sh_entrylk (frame, this, &sh->parent_loc, NULL, - lock_cbk); - return 0; -out: - int_lock = &local->internal_lock; - int_lock->lock_op_ret = -1; - lock_cbk (frame, this); - return 0; + return 0; } -static int -afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this) +gf_boolean_t +afr_is_data_set(xlator_t *this, dict_t *xdata) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - sh->sh_type_in_action = AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY; - - afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); - - afr_self_heal_parent_entrylk (frame, this, - afr_sh_post_nb_entrylk_missing_entry_sh_cbk); - return 0; + return afr_is_pending_set(this, xdata, AFR_DATA_TRANSACTION); } -afr_local_t* -afr_self_heal_local_init (afr_local_t *l, xlator_t *this) +gf_boolean_t +afr_is_metadata_set(xlator_t *this, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *lc = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *shc = NULL; - int ret = 0; + return afr_is_pending_set(this, xdata, AFR_METADATA_TRANSACTION); +} - priv = this->private; +gf_boolean_t +afr_is_entry_set(xlator_t *this, dict_t *xdata) +{ + return afr_is_pending_set(this, xdata, AFR_ENTRY_TRANSACTION); +} - sh = &l->self_heal; +/* + * This function inspects the looked up replies (in an unlocked manner) + * and decides whether a locked verification and possible healing is + * required or not. It updates the three booleans for each type + * of healing. If the boolean flag gets set to FALSE, then we are sure + * no healing is required. If the boolean flag gets set to TRUE then + * we have to proceed with locked reinspection. + */ - lc = mem_get0 (this->local_pool); - if (!lc) - goto out; +int +afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid, + inode_t **link_inode, gf_boolean_t *data_selfheal, + gf_boolean_t *metadata_selfheal, + gf_boolean_t *entry_selfheal, + struct afr_reply *replies_dst) +{ + afr_private_t *priv = NULL; + inode_t *inode = NULL; + int i = 0; + int valid_cnt = 0; + struct iatt first = { + 0, + }; + int first_idx = 0; + struct afr_reply *replies = NULL; + int ret = -1; + + priv = this->private; + + inode = afr_inode_find(this, gfid); + if (!inode) + goto out; + + replies = alloca0(sizeof(*replies) * priv->child_count); + + ret = afr_selfheal_unlocked_discover(frame, inode, gfid, replies); + if (ret) + goto out; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + if (replies[i].op_ret == -1) + continue; + + /* The data segment of the changelog can be non-zero to indicate + * the directory needs a full heal. So the check below ensures + * it's not a directory before setting the data_selfheal boolean. + */ + if (data_selfheal && !IA_ISDIR(replies[i].poststat.ia_type) && + afr_is_data_set(this, replies[i].xdata)) + *data_selfheal = _gf_true; + + if (metadata_selfheal && afr_is_metadata_set(this, replies[i].xdata)) + *metadata_selfheal = _gf_true; + + if (entry_selfheal && afr_is_entry_set(this, replies[i].xdata)) + *entry_selfheal = _gf_true; + + valid_cnt++; + if (valid_cnt == 1) { + first = replies[i].poststat; + first_idx = i; + continue; + } + + if (!IA_EQUAL(first, replies[i].poststat, type)) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "TYPE mismatch %d vs %d on %s for gfid:%s", + (int)first.ia_type, (int)replies[i].poststat.ia_type, + priv->children[i]->name, + uuid_utoa(replies[i].poststat.ia_gfid)); + gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" + "subvol=%s;" + "type=file;gfid=%s;" + "ia_type-%d=%s;ia_type-%d=%s", + this->ctx->cmd_args.client_pid, this->name, + uuid_utoa(replies[i].poststat.ia_gfid), first_idx, + gf_inode_type_to_str(first.ia_type), i, + gf_inode_type_to_str(replies[i].poststat.ia_type)); + ret = -EIO; + goto out; + } + + if (!IA_EQUAL(first, replies[i].poststat, uid)) { + gf_msg_debug(this->name, 0, + "UID mismatch " + "%d vs %d on %s for gfid:%s", + (int)first.ia_uid, (int)replies[i].poststat.ia_uid, + priv->children[i]->name, + uuid_utoa(replies[i].poststat.ia_gfid)); + + if (metadata_selfheal) + *metadata_selfheal = _gf_true; + } + + if (!IA_EQUAL(first, replies[i].poststat, gid)) { + gf_msg_debug(this->name, 0, + "GID mismatch " + "%d vs %d on %s for gfid:%s", + (int)first.ia_uid, (int)replies[i].poststat.ia_uid, + priv->children[i]->name, + uuid_utoa(replies[i].poststat.ia_gfid)); + + if (metadata_selfheal) + *metadata_selfheal = _gf_true; + } + + if (!IA_EQUAL(first, replies[i].poststat, prot)) { + gf_msg_debug(this->name, 0, + "MODE mismatch " + "%d vs %d on %s for gfid:%s", + (int)st_mode_from_ia(first.ia_prot, 0), + (int)st_mode_from_ia(replies[i].poststat.ia_prot, 0), + priv->children[i]->name, + uuid_utoa(replies[i].poststat.ia_gfid)); + + if (metadata_selfheal) + *metadata_selfheal = _gf_true; + } + + if (IA_ISREG(first.ia_type) && + !IA_EQUAL(first, replies[i].poststat, size)) { + gf_msg_debug(this->name, 0, + "SIZE mismatch " + "%lld vs %lld on %s for gfid:%s", + (long long)first.ia_size, + (long long)replies[i].poststat.ia_size, + priv->children[i]->name, + uuid_utoa(replies[i].poststat.ia_gfid)); + + if (data_selfheal) + *data_selfheal = _gf_true; + } + } + + if (valid_cnt > 0 && link_inode) { + *link_inode = inode_link(inode, NULL, NULL, &first); + if (!*link_inode) { + ret = -EINVAL; + goto out; + } + } else if (valid_cnt < 2) { + ret = afr_check_stale_error(replies, priv); + goto out; + } + + ret = 0; +out: + if (replies && replies_dst) + afr_replies_copy(replies_dst, replies, priv->child_count); + if (inode) + inode_unref(inode); + if (replies) + afr_replies_wipe(replies, priv->child_count); - shc = &lc->self_heal; - - shc->unwind = sh->unwind; - shc->gfid_sh_success_cbk = sh->gfid_sh_success_cbk; - shc->do_missing_entry_self_heal = sh->do_missing_entry_self_heal; - shc->do_gfid_self_heal = sh->do_gfid_self_heal; - shc->do_data_self_heal = sh->do_data_self_heal; - shc->do_metadata_self_heal = sh->do_metadata_self_heal; - shc->do_entry_self_heal = sh->do_entry_self_heal; - shc->force_confirm_spb = sh->force_confirm_spb; - shc->forced_merge = sh->forced_merge; - shc->background = sh->background; - shc->type = sh->type; - shc->data_sh_info = ""; - shc->metadata_sh_info = ""; - - uuid_copy (shc->sh_gfid_req, sh->sh_gfid_req); - if (l->loc.path) { - ret = loc_copy (&lc->loc, &l->loc); - if (ret < 0) - goto out; - } + return ret; +} - lc->child_up = memdup (l->child_up, - sizeof (*lc->child_up) * priv->child_count); - if (!lc->child_up) { - ret = -1; - goto out; - } +inode_t * +afr_inode_find(xlator_t *this, uuid_t gfid) +{ + inode_table_t *table = NULL; + inode_t *inode = NULL; - if (l->xattr_req) - lc->xattr_req = dict_ref (l->xattr_req); + table = this->itable; + if (!table) + return NULL; - if (l->cont.lookup.inode) - lc->cont.lookup.inode = inode_ref (l->cont.lookup.inode); - if (l->cont.lookup.xattr) - lc->cont.lookup.xattr = dict_ref (l->cont.lookup.xattr); + inode = inode_find(table, gfid); + if (inode) + return inode; - lc->internal_lock.locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), - priv->child_count, gf_afr_mt_char); - if (!lc->internal_lock.locked_nodes) { - ret = -1; - goto out; - } + inode = inode_new(table); + if (!inode) + return NULL; - ret = afr_inodelk_init (&lc->internal_lock.inodelk[0], - this->name, priv->child_count); - if (ret) - goto out; + gf_uuid_copy(inode->gfid, gfid); -out: - if (ret) { - afr_local_cleanup (lc, this); - lc = NULL; - } - return lc; + return inode; } -int -afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_local_t * orig_frame_local = NULL; - afr_self_heal_t * orig_frame_sh = NULL; - char sh_type_str[256] = {0,}; - gf_loglevel_t loglevel = 0; - - priv = this->private; - local = bgsh_frame->local; - sh = &local->self_heal; - - if (local->unhealable) { - afr_set_split_brain (this, sh->inode, SPB, SPB); - } - - afr_self_heal_type_str_get (sh, sh_type_str, - sizeof(sh_type_str)); - if (is_self_heal_failed (sh, AFR_CHECK_ALL) && !priv->shd.iamshd) { - loglevel = GF_LOG_ERROR; - } else if (!is_self_heal_failed (sh, AFR_CHECK_ALL)) { - loglevel = GF_LOG_INFO; - } else { - loglevel = GF_LOG_DEBUG; - } +call_frame_t * +afr_frame_create(xlator_t *this, int32_t *op_errno) +{ + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + pid_t pid = GF_CLIENT_PID_SELF_HEALD; - afr_log_self_heal_completion_status (local, loglevel); + frame = create_frame(this, this->ctx->pool); + if (!frame) { + if (op_errno) + *op_errno = ENOMEM; + return NULL; + } - FRAME_SU_UNDO (bgsh_frame, afr_local_t); + local = AFR_FRAME_INIT(frame, (*op_errno)); + if (!local) { + STACK_DESTROY(frame->root); + return NULL; + } - if (!sh->unwound && sh->unwind) { - orig_frame_local = sh->orig_frame->local; - orig_frame_sh = &orig_frame_local->self_heal; - orig_frame_sh->actual_sh_started = _gf_true; - sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno, - is_self_heal_failed (sh, AFR_CHECK_ALL)); - } + syncopctx_setfspid(&pid); - if (sh->background) { - LOCK (&priv->lock); - { - priv->background_self_heals_started--; - } - UNLOCK (&priv->lock); - } + frame->root->pid = pid; - AFR_STACK_DESTROY (bgsh_frame); + afr_set_lk_owner(frame, this, frame->root); - return 0; + return frame; } int -afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int32_t op_errno = 0; - int ret = 0; - afr_self_heal_t *orig_sh = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t *sh_local = NULL; - loc_t *loc = NULL; - - local = frame->local; - orig_sh = &local->self_heal; - priv = this->private; - - GF_ASSERT (local->loc.path); - - gf_log (this->name, GF_LOG_TRACE, - "performing self heal on %s (metadata=%d data=%d entry=%d)", - local->loc.path, - local->self_heal.do_metadata_self_heal, - local->self_heal.do_data_self_heal, - local->self_heal.do_entry_self_heal); - - op_errno = ENOMEM; - sh_frame = copy_frame (frame); - if (!sh_frame) - goto out; - afr_set_lk_owner (sh_frame, this, sh_frame->root); - afr_set_low_priority (sh_frame); - - sh_local = afr_self_heal_local_init (local, this); - if (!sh_local) - goto out; - sh_frame->local = sh_local; - sh = &sh_local->self_heal; - - sh->inode = inode_ref (inode); - sh->orig_frame = frame; - - sh->completion_cbk = afr_self_heal_completion_cbk; - - sh->success = GF_CALLOC (priv->child_count, sizeof (*sh->success), - gf_afr_mt_char); - if (!sh->success) - goto out; - sh->sources = GF_CALLOC (sizeof (*sh->sources), priv->child_count, - gf_afr_mt_int); - if (!sh->sources) - goto out; - sh->locked_nodes = GF_CALLOC (sizeof (*sh->locked_nodes), - priv->child_count, - gf_afr_mt_int); - if (!sh->locked_nodes) - goto out; - - sh->pending_matrix = afr_matrix_create (priv->child_count, - priv->child_count); - if (!sh->pending_matrix) - goto out; - - sh->delta_matrix = afr_matrix_create (priv->child_count, - priv->child_count); - if (!sh->delta_matrix) - goto out; - - sh->fresh_parent_dirs = afr_children_create (priv->child_count); - if (!sh->fresh_parent_dirs) - goto out; - ret = afr_sh_common_create (sh, priv->child_count); - if (ret) { - op_errno = -ret; - goto out; - } +afr_selfheal_newentry_mark(call_frame_t *frame, xlator_t *this, inode_t *inode, + int source, struct afr_reply *replies, + unsigned char *sources, unsigned char *newentry) +{ + int ret = 0; + int i = 0; + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + int **changelog = NULL; - if (local->self_heal.background) { - LOCK (&priv->lock); - { - if (priv->background_self_heals_started - < priv->background_self_heal_count) { - priv->background_self_heals_started++; + priv = this->private; + gf_uuid_copy(inode->gfid, replies[source].poststat.ia_gfid); - } else { - local->self_heal.background = _gf_false; - sh->background = _gf_false; - } - } - UNLOCK (&priv->lock); - } + xattr = dict_new(); + if (!xattr) + return -ENOMEM; - if (!local->loc.parent) { - sh->do_missing_entry_self_heal = _gf_false; - sh->do_gfid_self_heal = _gf_false; - } + changelog = afr_mark_pending_changelog(priv, newentry, xattr, + replies[source].poststat.ia_type); - sh->sh_type_in_action = AFR_SELF_HEAL_INVALID; + if (!changelog) { + ret = -ENOMEM; + goto out; + } - FRAME_SU_DO (sh_frame, afr_local_t); - if (sh->do_missing_entry_self_heal || sh->do_gfid_self_heal) { - afr_self_heal_missing_entries (sh_frame, this); - } else { - loc = &sh_local->loc; - if (uuid_is_null (loc->inode->gfid) && uuid_is_null (loc->gfid)) { - if (!uuid_is_null (inode->gfid)) - GF_ASSERT (!uuid_compare (inode->gfid, - sh->sh_gfid_req)); - uuid_copy (loc->gfid, sh->sh_gfid_req); - } - gf_log (this->name, GF_LOG_TRACE, - "proceeding to metadata check on %s", - local->loc.path); + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + ret |= afr_selfheal_post_op(frame, this, inode, i, xattr, NULL); + } +out: + if (changelog) + afr_matrix_cleanup(changelog, priv->child_count); + if (xattr) + dict_unref(xattr); + return ret; +} - afr_sh_missing_entries_done (sh_frame, this); - } - op_errno = 0; +int +afr_selfheal_do(call_frame_t *frame, xlator_t *this, uuid_t gfid) +{ + int ret = -1; + int entry_ret = 1; + int metadata_ret = 1; + int data_ret = 1; + int or_ret = 0; + inode_t *inode = NULL; + fd_t *fd = NULL; + gf_boolean_t data_selfheal = _gf_false; + gf_boolean_t metadata_selfheal = _gf_false; + gf_boolean_t entry_selfheal = _gf_false; + afr_private_t *priv = NULL; + + priv = this->private; + + ret = afr_selfheal_unlocked_inspect(frame, this, gfid, &inode, + &data_selfheal, &metadata_selfheal, + &entry_selfheal, NULL); + if (ret) + goto out; + + if (!(data_selfheal || metadata_selfheal || entry_selfheal)) { + ret = 2; + goto out; + } + + if (inode->ia_type == IA_IFREG) { + ret = afr_selfheal_data_open(this, inode, &fd); + if (!fd) { + ret = -EIO; + goto out; + } + } + + if (data_selfheal && priv->data_self_heal) + data_ret = afr_selfheal_data(frame, this, fd); + + if (metadata_selfheal && priv->metadata_self_heal) + metadata_ret = afr_selfheal_metadata(frame, this, inode); + + if (entry_selfheal && priv->entry_self_heal) + entry_ret = afr_selfheal_entry(frame, this, inode); + + or_ret = (data_ret | metadata_ret | entry_ret); + + if (data_ret == -EIO || metadata_ret == -EIO || entry_ret == -EIO) + ret = -EIO; + else if (data_ret == 1 && metadata_ret == 1 && entry_ret == 1) + ret = 1; + else if (or_ret < 0) + ret = or_ret; + else + ret = 0; out: - if (op_errno) { - orig_sh->unwind (frame, this, -1, op_errno, 1); - if (sh_frame) - AFR_STACK_DESTROY (sh_frame); - } - return 0; + if (inode) + inode_unref(inode); + if (fd) + fd_unref(fd); + return ret; } +/* + * This is the entry point for healing a given GFID. The return values for this + * function are as follows: + * '0' if the self-heal is successful + * '1' if the afr-xattrs are non-zero (due to on-going IO) and no heal is needed + * '2' if the afr-xattrs are all-zero and no heal is needed + * $errno if the heal on the gfid failed. + */ -void -afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str, - size_t size) +int +afr_selfheal(xlator_t *this, uuid_t gfid) { - GF_ASSERT (str && (size > strlen (" missing-entry gfid " - "meta-data data entry"))); + int ret = -1; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; - if (self_heal_p->do_metadata_self_heal) { - snprintf (str, size, " meta-data"); - } + frame = afr_frame_create(this, NULL); + if (!frame) + return ret; - if (self_heal_p->do_data_self_heal) { - snprintf (str + strlen(str), size - strlen(str), " data"); - } + local = frame->local; + local->xdata_req = dict_new(); - if (self_heal_p->do_entry_self_heal) { - snprintf (str + strlen(str), size - strlen(str), " entry"); - } + ret = afr_selfheal_do(frame, this, gfid); - if (self_heal_p->do_missing_entry_self_heal) { - snprintf (str + strlen(str), size - strlen(str), - " missing-entry"); - } + if (frame) + AFR_STACK_DESTROY(frame); - if (self_heal_p->do_gfid_self_heal) { - snprintf (str + strlen(str), size - strlen(str), " gfid"); - } + return ret; } -afr_self_heal_type -afr_self_heal_type_for_transaction (afr_transaction_type type) +afr_local_t * +__afr_dequeue_heals(afr_private_t *priv) { - afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID; + afr_local_t *local = NULL; - switch (type) { - case AFR_DATA_TRANSACTION: - sh_type = AFR_SELF_HEAL_DATA; - break; - case AFR_METADATA_TRANSACTION: - sh_type = AFR_SELF_HEAL_METADATA; - break; - case AFR_ENTRY_TRANSACTION: - sh_type = AFR_SELF_HEAL_ENTRY; - break; - case AFR_ENTRY_RENAME_TRANSACTION: - GF_ASSERT (0); - break; - } - return sh_type; + if (list_empty(&priv->heal_waiting)) + goto none; + if ((priv->background_self_heal_count > 0) && + (priv->healers >= priv->background_self_heal_count)) + goto none; + + local = list_entry(priv->heal_waiting.next, afr_local_t, healer); + priv->heal_waiters--; + GF_ASSERT(priv->heal_waiters >= 0); + list_del_init(&local->healer); + list_add(&local->healer, &priv->healing); + priv->healers++; + return local; +none: + gf_msg_debug(THIS->name, 0, + "Nothing dequeued. " + "Num healers: %d, Num Waiters: %d", + priv->healers, priv->heal_waiters); + return NULL; } int -afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) +afr_refresh_selfheal_wrap(void *opaque) { - int ret = -1; - uuid_t pargfid = {0}; - - if (!child) - goto out; - - if (!uuid_is_null (parent->inode->gfid)) - uuid_copy (pargfid, parent->inode->gfid); - else if (!uuid_is_null (parent->gfid)) - uuid_copy (pargfid, parent->gfid); - - if (uuid_is_null (pargfid)) - goto out; - - if (strcmp (parent->path, "/") == 0) - ret = gf_asprintf ((char **)&child->path, "/%s", name); - else - ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path, - name); - - if (-1 == ret) { - gf_log (this->name, GF_LOG_ERROR, - "asprintf failed while setting child path"); - } - - child->name = strrchr (child->path, '/'); - if (child->name) - child->name++; + call_frame_t *heal_frame = opaque; + afr_local_t *local = heal_frame->local; + int ret = 0; - child->parent = inode_ref (parent->inode); - child->inode = inode_new (parent->inode->table); - uuid_copy (child->pargfid, pargfid); - - if (!child->inode) { - ret = -1; - goto out; - } - - ret = 0; -out: - if ((ret == -1) && child) - loc_wipe (child); - - return ret; + ret = afr_selfheal(heal_frame->this, local->refreshinode->gfid); + return ret; } int -afr_sh_erase_pending (call_frame_t *frame, xlator_t *this, - afr_transaction_type type, afr_fxattrop_cbk_t cbk, - int (*finish)(call_frame_t *frame, xlator_t *this)) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; - dict_t **erase_xattr = NULL; - int ret = -1; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, - sh->success, priv->child_count, type); - - erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count, - gf_afr_mt_dict_t); - if (!erase_xattr) - goto out; - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - call_count++; - erase_xattr[i] = dict_new (); - if (!erase_xattr[i]) - goto out; - } - } - - afr_sh_delta_to_xattr (this, sh->delta_matrix, erase_xattr, - priv->child_count, type); - - gf_log (this->name, GF_LOG_DEBUG, "Delta matrix for: %s", - lkowner_utoa (&frame->root->lk_owner)); - afr_sh_print_pending_matrix (sh->delta_matrix, this); - local->call_count = call_count; - if (call_count == 0) { - ret = 0; - finish (frame, this); - goto out; - } - - for (i = 0; i < priv->child_count; i++) { - if (!erase_xattr[i]) - continue; - - if (sh->healing_fd) {//true for ENTRY, reg file DATA transaction - STACK_WIND_COOKIE (frame, cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - sh->healing_fd, - GF_XATTROP_ADD_ARRAY, erase_xattr[i], - NULL); - } else { - STACK_WIND_COOKIE (frame, cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, erase_xattr[i], - NULL); - } - } - - ret = 0; -out: - if (erase_xattr) { - for (i = 0; i < priv->child_count; i++) { - if (erase_xattr[i]) { - dict_unref (erase_xattr[i]); - } - } - } +afr_refresh_heal_done(int ret, call_frame_t *frame, void *opaque) +{ + call_frame_t *heal_frame = opaque; + xlator_t *this = heal_frame->this; + afr_private_t *priv = this->private; + afr_local_t *local = heal_frame->local; - GF_FREE (erase_xattr); + LOCK(&priv->lock); + { + list_del_init(&local->healer); + priv->healers--; + GF_ASSERT(priv->healers >= 0); + local = __afr_dequeue_heals(priv); + } + UNLOCK(&priv->lock); - if (ret < 0) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - finish (frame, this); - } + AFR_STACK_DESTROY(heal_frame); - return 0; + if (local) + afr_heal_synctask(this, local); + return 0; } void -afr_set_self_heal_status(afr_self_heal_t *sh, afr_self_heal_status status) +afr_heal_synctask(xlator_t *this, afr_local_t *local) { - xlator_t *this = NULL; - afr_sh_status_for_all_type *sh_status = &(sh->afr_all_sh_status); - afr_self_heal_type sh_type_in_action = sh->sh_type_in_action; - this = THIS; + int ret = 0; + call_frame_t *heal_frame = NULL; - if (!sh) { - gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal" - "Structure"); - goto out; - } - - switch (sh_type_in_action) { - case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY: - sh_status->gfid_or_missing_entry_self_heal = status; - break; - case AFR_SELF_HEAL_METADATA: - sh_status->metadata_self_heal = status; - break; - case AFR_SELF_HEAL_DATA: - sh_status->data_self_heal = status; - break; - case AFR_SELF_HEAL_ENTRY: - sh_status->entry_self_heal = status; - break; - case AFR_SELF_HEAL_INVALID: - gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid" - "self heal type in action"); - break; - } -out: - return; + heal_frame = local->heal_frame; + ret = synctask_new(this->ctx->env, afr_refresh_selfheal_wrap, + afr_refresh_heal_done, heal_frame, heal_frame); + if (ret < 0) + /* Heal not launched. Will be queued when the next inode + * refresh happens and shd hasn't healed it yet. */ + afr_refresh_heal_done(ret, heal_frame, heal_frame); } -void -afr_set_local_for_unhealable (afr_local_t *local) -{ - afr_self_heal_t *sh = NULL; +gf_boolean_t +afr_throttled_selfheal(call_frame_t *frame, xlator_t *this) +{ + gf_boolean_t can_heal = _gf_true; + afr_private_t *priv = this->private; + afr_local_t *local = frame->local; + + LOCK(&priv->lock); + { + if ((priv->background_self_heal_count > 0) && + (priv->heal_wait_qlen + priv->background_self_heal_count) > + (priv->heal_waiters + priv->healers)) { + list_add_tail(&local->healer, &priv->heal_waiting); + priv->heal_waiters++; + local = __afr_dequeue_heals(priv); + } else { + can_heal = _gf_false; + } + } + UNLOCK(&priv->lock); - sh = &local->self_heal; + if (can_heal) { + if (local) + afr_heal_synctask(this, local); + else + gf_msg_debug(this->name, 0, + "Max number of heals are " + "pending, background self-heal rejected."); + } - local->unhealable = 1; - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + return can_heal; } int -is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type) +afr_choose_source_by_policy(afr_private_t *priv, unsigned char *sources, + afr_transaction_type type) { - afr_sh_status_for_all_type sh_status = sh->afr_all_sh_status; - afr_self_heal_type sh_type_in_action = AFR_SELF_HEAL_INVALID; - afr_self_heal_status status = AFR_SELF_HEAL_FAILED; - xlator_t *this = NULL; - int sh_failed = 0; + int source = -1; + int i = 0; - this = THIS; + /* Give preference to local child to save on bandwidth */ + for (i = 0; i < priv->child_count; i++) { + if (priv->local[i] && sources[i]) { + if ((type == AFR_DATA_TRANSACTION) && AFR_IS_ARBITER_BRICK(priv, i)) + continue; - if (!sh) { - gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal " - "structure"); - sh_failed = 1; - goto out; + source = i; + goto out; } + } - if (type == AFR_CHECK_ALL) { - if ((sh_status.gfid_or_missing_entry_self_heal == AFR_SELF_HEAL_FAILED) - || (sh_status.metadata_self_heal == AFR_SELF_HEAL_FAILED) - || (sh_status.data_self_heal == AFR_SELF_HEAL_FAILED) - || (sh_status.entry_self_heal == AFR_SELF_HEAL_FAILED)) - sh_failed = 1; - } else if (type == AFR_CHECK_SPECIFIC) { - sh_type_in_action = sh->sh_type_in_action; - switch (sh_type_in_action) { - case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY: - status = sh_status.gfid_or_missing_entry_self_heal; - break; - case AFR_SELF_HEAL_METADATA: - status = sh_status.metadata_self_heal; - break; - case AFR_SELF_HEAL_ENTRY: - status = sh_status.entry_self_heal; - break; - case AFR_SELF_HEAL_DATA: - status = sh_status.data_self_heal; - break; - case AFR_SELF_HEAL_INVALID: - status = AFR_SELF_HEAL_NOT_ATTEMPTED; - break; - } - if (status == AFR_SELF_HEAL_FAILED) - sh_failed = 1; - + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + source = i; + goto out; } - + } out: - return sh_failed; -} - -char * -get_sh_completion_status (afr_self_heal_status status) -{ - - char *not_attempted = " is not attempted"; - char *failed = " failed"; - char *started = " is started"; - char *sync_begin = " is successfully completed"; - char *result = " has unknown status"; - - switch (status) - { - case AFR_SELF_HEAL_NOT_ATTEMPTED: - result = not_attempted; - break; - case AFR_SELF_HEAL_FAILED: - result = failed; - break; - case AFR_SELF_HEAL_STARTED: - result = started; - break; - case AFR_SELF_HEAL_SYNC_BEGIN: - result = sync_begin; - break; + return source; +} + +static int +afr_anon_inode_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + afr_local_t *local = frame->local; + int i = (long)cookie; + + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (op_ret == 0) { + local->op_ret = 0; + local->replies[i].poststat = *buf; + local->replies[i].preparent = *preparent; + local->replies[i].postparent = *postparent; + } + if (xdata) { + local->replies[i].xdata = dict_ref(xdata); + } + + syncbarrier_wake(&local->barrier); + return 0; +} + +int +afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode) +{ + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = this->private; + unsigned char *mkdir_on = alloca0(priv->child_count); + unsigned char *lookup_on = alloca0(priv->child_count); + loc_t loc = {0}; + int32_t op_errno = 0; + int32_t child_op_errno = 0; + struct iatt iatt = {0}; + dict_t *xdata = NULL; + uuid_t anon_inode_gfid = {0}; + int mkdir_count = 0; + int i = 0; + + /*Try to mkdir everywhere and return success if the dir exists on 'child' + */ + + if (!priv->use_anon_inode) { + op_errno = EINVAL; + goto out; + } + + frame = afr_frame_create(this, &op_errno); + if (op_errno) { + goto out; + } + local = frame->local; + if (!local->child_up[child]) { + /*Other bricks may need mkdir so don't error out yet*/ + child_op_errno = ENOTCONN; + } + gf_uuid_parse(priv->anon_gfid_str, anon_inode_gfid); + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; + + if (priv->anon_inode[i]) { + mkdir_on[i] = 0; + } else { + mkdir_on[i] = 1; + mkdir_count++; } + } - return result; + if (mkdir_count == 0) { + *linked_inode = inode_find(this->itable, anon_inode_gfid); + if (*linked_inode) { + op_errno = 0; + goto out; + } + } -} + loc.parent = inode_ref(this->itable->root); + loc.name = priv->anon_inode_name; + loc.inode = inode_new(this->itable); + if (!loc.inode) { + op_errno = ENOMEM; + goto out; + } -void -afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t loglvl) -{ - - char sh_log[4096] = {0}; - afr_self_heal_t *sh = &local->self_heal; - afr_sh_status_for_all_type all_status = sh->afr_all_sh_status; - xlator_t *this = NULL; - size_t off = 0; - int data_sh = 0; - int metadata_sh = 0; - int print_log = 0; - - this = THIS; - - ADD_FMT_STRING (sh_log, off, "gfid or missing entry", - all_status.gfid_or_missing_entry_self_heal, print_log); - ADD_FMT_STRING_SYNC (sh_log, off, "metadata", - all_status.metadata_self_heal, print_log); - if (sh->background) { - ADD_FMT_STRING_SYNC (sh_log, off, "backgroung data", - all_status.data_self_heal, print_log); + xdata = dict_new(); + if (!xdata) { + op_errno = ENOMEM; + goto out; + } + + op_errno = -dict_set_gfuuid(xdata, "gfid-req", anon_inode_gfid, _gf_true); + if (op_errno) { + goto out; + } + + if (mkdir_count == 0) { + memcpy(lookup_on, local->child_up, priv->child_count); + goto lookup; + } + + AFR_ONLIST(mkdir_on, frame, afr_anon_inode_mkdir_cbk, mkdir, &loc, 0755, 0, + xdata); + + for (i = 0; i < priv->child_count; i++) { + if (!mkdir_on[i]) { + continue; + } + + if (local->replies[i].op_ret == 0) { + priv->anon_inode[i] = 1; + iatt = local->replies[i].poststat; + } else if (local->replies[i].op_ret < 0 && + local->replies[i].op_errno == EEXIST) { + lookup_on[i] = 1; + } else if (i == child) { + child_op_errno = local->replies[i].op_errno; + } + } + + if (AFR_COUNT(lookup_on, priv->child_count) == 0) { + goto link; + } + +lookup: + AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xdata); + for (i = 0; i < priv->child_count; i++) { + if (!lookup_on[i]) { + continue; + } + + if (local->replies[i].op_ret == 0) { + if (gf_uuid_compare(anon_inode_gfid, + local->replies[i].poststat.ia_gfid) == 0) { + priv->anon_inode[i] = 1; + iatt = local->replies[i].poststat; + } else { + if (i == child) + child_op_errno = EINVAL; + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_DATA, + "%s has gfid: %s", priv->anon_inode_name, + uuid_utoa(local->replies[i].poststat.ia_gfid)); + } + } else if (i == child) { + child_op_errno = local->replies[i].op_errno; + } + } +link: + if (!gf_uuid_is_null(iatt.ia_gfid)) { + *linked_inode = inode_link(loc.inode, loc.parent, loc.name, &iatt); + if (*linked_inode) { + op_errno = 0; + inode_lookup(*linked_inode); } else { - ADD_FMT_STRING_SYNC (sh_log, off, "foreground data", - all_status.data_self_heal, print_log); + op_errno = ENOMEM; } - ADD_FMT_STRING_SYNC (sh_log, off, "entry", all_status.entry_self_heal, - print_log); - - if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.data_self_heal && - strcmp (sh->data_sh_info, "") && sh->data_sh_info ) - data_sh = 1; - if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.metadata_self_heal && - strcmp (sh->metadata_sh_info, "") && sh->metadata_sh_info) - metadata_sh = 1; - - if (!print_log) - return; - - gf_log (this->name, loglvl, "%s %s %s on %s", sh_log, - ((data_sh == 1) ? sh->data_sh_info : ""), - ((metadata_sh == 1) ? sh->metadata_sh_info : ""), - local->loc.path); + goto out; + } + +out: + if (xdata) + dict_unref(xdata); + loc_wipe(&loc); + /*child_op_errno takes precedence*/ + if (child_op_errno == 0) { + child_op_errno = op_errno; + } + + if (child_op_errno && *linked_inode) { + inode_unref(*linked_inode); + *linked_inode = NULL; + } + if (frame) + AFR_STACK_DESTROY(frame); + return -child_op_errno; } diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h deleted file mode 100644 index 4732647760d..00000000000 --- a/xlators/cluster/afr/src/afr-self-heal-common.h +++ /dev/null @@ -1,144 +0,0 @@ -/* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#ifndef __AFR_SELF_HEAL_COMMON_H__ -#define __AFR_SELF_HEAL_COMMON_H__ - -#define FILE_HAS_HOLES(buf) (((buf)->ia_size) > ((buf)->ia_blocks * 512)) -#define AFR_SH_MIN_PARTICIPANTS 2 - -typedef enum { - AFR_LOOKUP_FAIL_CONFLICTS = 1, - AFR_LOOKUP_FAIL_MISSING_GFIDS = 2, -} afr_lookup_flags_t; - -int -afr_sh_select_source (int sources[], int child_count); - -int -afr_sh_source_count (int sources[], int child_count); - -void -afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this); - -void -afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, - const char *loc); - -int -afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, - unsigned char *ignorant_subvols, - dict_t *xattr[], afr_transaction_type type, - size_t child_count); - -void -afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, - int32_t *delta_matrix[], unsigned char success[], - int child_count, afr_transaction_type type); - -int -afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix, - struct iatt *bufs, afr_self_heal_type type, - int32_t *success_children, int32_t *subvol_status); - -int -afr_sh_delta_to_xattr (xlator_t *this, - int32_t *delta_matrix[], dict_t *xattr[], - int child_count, afr_transaction_type type); - -void -afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str, - size_t size); - -afr_self_heal_type -afr_self_heal_type_for_transaction (afr_transaction_type type); - -int -afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs, - int32_t **pending_matrix, int32_t *sources, - int32_t *success_children, afr_transaction_type type, - int32_t *subvol_status, gf_boolean_t ignore_ignorant); -void -afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count); - -void -afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, - dict_t *xattr, struct iatt *postparent, - loc_t *loc); - -int -afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, - afr_lookup_done_cbk_t lookup_cbk, uuid_t uuid, - int32_t flags, dict_t *xdata); -int -afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, - int active_src, struct iatt *buf, - struct iatt *parentbuf); -int -afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, - char *base_name, afr_lock_cbk_t lock_cbk); -int -afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, - int child_index); -int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom, - afr_lock_cbk_t lock_cbk); -afr_local_t * -afr_self_heal_local_init (afr_local_t *l, xlator_t *this); -int -afr_sh_data_lock (call_frame_t *frame, xlator_t *this, - off_t start, off_t len, gf_boolean_t block, char *dom, - afr_lock_cbk_t success_handler, - afr_lock_cbk_t failure_handler); -void -afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno); -void -afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this); -typedef int -(*afr_fxattrop_cbk_t) (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr, dict_t *xdata); -int -afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name); -int -afr_impunge_frame_create (call_frame_t *frame, xlator_t *this, - int active_source, call_frame_t **impunge_frame); -void -afr_sh_reset (call_frame_t *frame, xlator_t *this); - -void -afr_children_intersection_get (int32_t *set1, int32_t *set2, - int *intersection, unsigned int child_count); -int -afr_get_no_xattr_dir_read_child (xlator_t *this, int32_t *success_children, - struct iatt *bufs); -int -afr_sh_erase_pending (call_frame_t *frame, xlator_t *this, - afr_transaction_type type, afr_fxattrop_cbk_t cbk, - int (*finish)(call_frame_t *frame, xlator_t *this)); - -void -afr_set_local_for_unhealable (afr_local_t *local); - -int -is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type); - -void -afr_set_self_heal_status (afr_self_heal_t *sh, afr_self_heal_status status); - -void -afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t logl); - -char* -afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this); -#endif /* __AFR_SELF_HEAL_COMMON_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 9de26ee569c..37bcc2b3f9e 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -8,1747 +8,884 @@ cases as published by the Free Software Foundation. */ -#include <libgen.h> -#include <unistd.h> -#include <fnmatch.h> -#include <sys/time.h> -#include <stdlib.h> -#include <signal.h> - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" - -#include "afr-transaction.h" #include "afr-self-heal.h" -#include "afr-self-heal-common.h" -#include "afr-self-heal-algorithm.h" - -int -afr_sh_data_fail (call_frame_t *frame, xlator_t *this); - -static inline gf_boolean_t -afr_sh_data_proceed (unsigned int success_count) -{ - return (success_count >= AFR_SH_MIN_PARTICIPANTS); -} - -extern int -sh_loop_finish (call_frame_t *loop_frame, xlator_t *this); - -int -afr_post_sh_big_lock_success (call_frame_t *frame, xlator_t *this); - -int -afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this); - -int -afr_sh_data_finish (call_frame_t *frame, xlator_t *this); - -int -afr_sh_data_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - sh->completion_cbk (frame, this); - - return 0; -} - - -int -afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = (long) cookie; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "flush failed on %s on subvolume %s: %s", - local->loc.path, priv->children[child_index]->name, - strerror (op_errno)); - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_sh_data_done (frame, this); - } - - return 0; -} - -int -afr_sh_data_close (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - int i = 0; - int call_count = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (!sh->healing_fd) { - //This happens when file is non-reg - afr_sh_data_done (frame, this); - return 0; - } - call_count = afr_set_elem_count_get (sh->success, - priv->child_count); - local->call_count = call_count; - - if (call_count == 0) { - afr_sh_data_done (frame, this); - return 0; - } - - for (i = 0; i < priv->child_count; i++) { - if (!sh->success[i]) - continue; - gf_log (this->name, GF_LOG_DEBUG, - "closing fd of %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->flush, - sh->healing_fd, NULL); - - if (!--call_count) - break; - } - - return 0; -} - -int -afr_sh_dom_unlock (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (sh->sh_dom_lock_held) - afr_sh_data_unlock (frame, this, priv->sh_domain, - afr_sh_data_close); - else - afr_sh_data_close (frame, this); - return 0; -} - -int -afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *statpre, - struct iatt *statpost, dict_t *xdata) -{ - - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = (long) cookie; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "setattr failed on %s on subvolume %s: %s", - local->loc.path, priv->children[child_index]->name, - strerror (op_errno)); - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_sh_data_finish (frame, this); - } - - return 0; -} - -int -afr_sh_data_setattr (call_frame_t *frame, xlator_t *this, struct iatt* stbuf) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - int i = 0; - int call_count = 0; - int32_t valid = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - valid = (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME); - - call_count = afr_set_elem_count_get (sh->success, - priv->child_count); - local->call_count = call_count; - - if (call_count == 0) { - GF_ASSERT (0); - afr_sh_data_finish (frame, this); - return 0; - } - - for (i = 0; i < priv->child_count; i++) { - if (!sh->success[i]) - continue; - - STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setattr, - &local->loc, stbuf, valid, NULL); - - if (!--call_count) - break; - } - - return 0; -} - -int -afr_sh_data_setattr_fstat_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *buf, dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int child_index = (long) cookie; - - local = frame->local; - sh = &local->self_heal; - - GF_ASSERT (sh->source == child_index); - if (op_ret != -1) { - sh->buf[child_index] = *buf; - afr_sh_data_setattr (frame, this, buf); - } else { - gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " - "time-stamps after self-heal", local->loc.path); - afr_sh_data_fail (frame, this); - } - - return 0; -} - -/* - * If there are any writes after the self-heal is triggered then the - * stbuf stored in local->self_heal.buf[] will be invalid so we do one more - * stat on the source and then set the [am]times - */ -int -afr_sh_set_timestamps (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - STACK_WIND_COOKIE (frame, afr_sh_data_setattr_fstat_cbk, - (void *) (long) sh->source, - priv->children[sh->source], - priv->children[sh->source]->fops->fstat, - sh->healing_fd, NULL); - return 0; -} +#include <glusterfs/byte-order.h> +#include "protocol-common.h" +#include "afr-messages.h" +#include <glusterfs/events.h> -//Fun fact, lock_cbk is being used for both lock & unlock -int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom, - afr_lock_cbk_t lock_cbk) -{ - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int ret = 0; - - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; - priv = this->private; - - if (strcmp (dom, this->name) == 0) { - sh->data_lock_held = _gf_false; - } else if (strcmp (dom, priv->sh_domain) == 0) { - sh->sh_dom_lock_held = _gf_false; +#define HAS_HOLES(i) ((i->ia_blocks * 512) < (i->ia_size)) +static int +__checksum_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, uint32_t weak, uint8_t *strong, dict_t *xdata) +{ + afr_local_t *local = NULL; + struct afr_reply *replies = NULL; + int i = (long)cookie; + + local = frame->local; + replies = local->replies; + + replies[i].valid = 1; + replies[i].op_ret = op_ret; + replies[i].op_errno = op_errno; + if (xdata) { + replies[i].buf_has_zeroes = dict_get_str_boolean( + xdata, "buf-has-zeroes", _gf_false); + replies[i].fips_mode_rchecksum = dict_get_str_boolean( + xdata, "fips-mode-rchecksum", _gf_false); + } + if (strong) { + if (replies[i].fips_mode_rchecksum) { + memcpy(local->replies[i].checksum, strong, SHA256_DIGEST_LENGTH); } else { - ret = -1; - goto out; - } - int_lock->lock_cbk = lock_cbk; - int_lock->domain = dom; - afr_unlock (frame, this); - -out: - if (ret) { - int_lock->lock_op_ret = -1; - int_lock->lock_cbk (frame, this); - } - return 0; -} - -int -afr_sh_data_finish (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - gf_log (this->name, GF_LOG_DEBUG, - "finishing data selfheal of %s", local->loc.path); + memcpy(local->replies[i].checksum, strong, MD5_DIGEST_LENGTH); + } + } + + syncbarrier_wake(&local->barrier); + return 0; +} + +static gf_boolean_t +__afr_can_skip_data_block_heal(call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *healed_sinks, + off_t offset, size_t size, struct iatt *poststat) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + unsigned char *wind_subvols = NULL; + gf_boolean_t checksum_match = _gf_true; + struct afr_reply *replies = NULL; + dict_t *xdata = NULL; + int i = 0; + + priv = this->private; + local = frame->local; + replies = local->replies; + + xdata = dict_new(); + if (!xdata) + goto out; + if (dict_set_int32_sizen(xdata, "check-zero-filled", 1)) { + dict_unref(xdata); + goto out; + } + + wind_subvols = alloca0(priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (i == source || healed_sinks[i]) + wind_subvols[i] = 1; + } + + AFR_ONLIST(wind_subvols, frame, __checksum_cbk, rchecksum, fd, offset, size, + xdata); + if (xdata) + dict_unref(xdata); + + if (!replies[source].valid || replies[source].op_ret != 0) + return _gf_false; - if (sh->data_lock_held) - afr_sh_data_unlock (frame, this, this->name, afr_sh_dom_unlock); + for (i = 0; i < priv->child_count; i++) { + if (i == source) + continue; + if (replies[i].valid) { + if (memcmp(replies[source].checksum, replies[i].checksum, + replies[source].fips_mode_rchecksum + ? SHA256_DIGEST_LENGTH + : MD5_DIGEST_LENGTH)) { + checksum_match = _gf_false; + break; + } + } + } + + if (checksum_match) { + if (HAS_HOLES(poststat)) + return _gf_true; + + /* For non-sparse files, we might be better off writing the + * zeroes to sinks to avoid mismatch of disk-usage in bricks. */ + if (local->replies[source].buf_has_zeroes) + return _gf_false; else - afr_sh_dom_unlock (frame, this); - - return 0; -} - -int -afr_sh_data_fail (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - gf_log (this->name, GF_LOG_DEBUG, - "finishing failed data selfheal of %s", local->loc.path); - - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_data_finish (frame, this); - return 0; -} - -int -afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr, dict_t *xdata) -{ - int call_count = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int32_t child_index = (long) cookie; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - if (op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "Erasing of pending change " - "log failed on %s for subvol %s, reason: %s", - local->loc.path, priv->children[child_index]->name, - strerror (op_errno)); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - if (sh->old_loop_frame) - sh_loop_finish (sh->old_loop_frame, this); - sh->old_loop_frame = NULL; - afr_sh_data_fail (frame, this); - goto out; - } - if (!IA_ISREG (sh->type)) { - afr_sh_data_finish (frame, this); - goto out; - } - GF_ASSERT (sh->old_loop_frame); - afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name, - afr_post_sh_big_lock_success, - afr_post_sh_big_lock_failure); - } + return _gf_true; + } out: - return 0; -} - -int -afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this) -{ - afr_sh_erase_pending (frame, this, AFR_DATA_TRANSACTION, - afr_sh_data_erase_pending_cbk, - afr_sh_data_finish); - return 0; -} - -int -afr_sh_data_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *pre, - struct iatt *post, dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - int call_count = 0; - int child_index = (long) cookie; - - local = frame->local; - priv = this->private; - sh = &local->self_heal; - - if (op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "%s: Failed to fsync on " - "%s - %s", local->loc.path, - priv->children[child_index]->name, strerror (op_errno)); - LOCK (&frame->lock); - { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } - UNLOCK (&frame->lock); - if (sh->old_loop_frame) - sh_loop_finish (sh->old_loop_frame, this); - sh->old_loop_frame = NULL; - } - - call_count = afr_frame_return (frame); - if (call_count == 0) { - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) - afr_sh_data_fail (frame, this); - else - afr_sh_data_erase_pending (frame, this); - } - return 0; -} - -/* - * Before erasing xattrs, make sure the data is written to disk - */ -int -afr_sh_data_fsync (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - int i = 0; - int call_count = 0; - - local = frame->local; - priv = this->private; - sh = &local->self_heal; - - call_count = sh->active_sinks; - if (call_count == 0) { - afr_sh_data_erase_pending (frame, this); - return 0; - } - - local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (!sh->success[i] || sh->sources[i]) - continue; - - STACK_WIND_COOKIE (frame, afr_sh_data_fsync_cbk, - (void *) (long) i, priv->children[i], - priv->children[i]->fops->fsync, - sh->healing_fd, 1, NULL); - } - - return 0; -} - -static struct afr_sh_algorithm * -sh_algo_from_name (xlator_t *this, char *name) -{ - int i = 0; - - if (name == NULL) - goto out; - - while (afr_self_heal_algorithms[i].name) { - if (!strcmp (name, afr_self_heal_algorithms[i].name)) { - return &afr_self_heal_algorithms[i]; - } - - i++; - } - + return _gf_false; +} + +static gf_boolean_t +__afr_is_sink_zero_filled(xlator_t *this, fd_t *fd, size_t size, off_t offset, + int sink) +{ + afr_private_t *priv = NULL; + struct iobref *iobref = NULL; + struct iovec *iovec = NULL; + int count = 0; + int ret = 0; + gf_boolean_t zero_filled = _gf_false; + + priv = this->private; + ret = syncop_readv(priv->children[sink], fd, size, offset, 0, &iovec, + &count, &iobref, NULL, NULL, NULL); + if (ret < 0) + goto out; + ret = iov_0filled(iovec, count); + if (!ret) + zero_filled = _gf_true; out: - return NULL; + if (iovec) + GF_FREE(iovec); + if (iobref) + iobref_unref(iobref); + return zero_filled; } - static int -sh_zero_byte_files_exist (afr_local_t *local, int child_count) -{ - int i = 0; - int ret = 0; - afr_self_heal_t *sh = NULL; - - sh = &local->self_heal; - for (i = 0; i < child_count; i++) { - if (!local->child_up[i] || sh->child_errno[i]) - continue; - if (sh->buf[i].ia_size == 0) { - ret = 1; - break; - } - } - +__afr_selfheal_data_read_write(call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *healed_sinks, + off_t offset, size_t size, + struct afr_reply *replies, int type) +{ + struct iovec *iovec = NULL; + int count = 0; + struct iobref *iobref = NULL; + int ret = 0; + int i = 0; + afr_private_t *priv = NULL; + + priv = this->private; + + ret = syncop_readv(priv->children[source], fd, size, offset, 0, &iovec, + &count, &iobref, NULL, NULL, NULL); + if (ret <= 0) return ret; -} + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i]) + continue; + + /* + * TODO: Use fiemap() and discard() to heal holes + * in the future. + * + * For now, + * + * - if the source had any holes at all, + * AND + * - if we are writing past the original file size + * of the sink + * AND + * - is NOT the last block of the source file. if + * the block contains EOF, it has to be written + * in order to set the file size even if the + * last block is 0-filled. + * AND + * - if the read buffer is filled with only 0's + * + * then, skip writing to this source. We don't depend + * on the write to happen to update the size as we + * have performed an ftruncate() upfront anyways. + */ +#define is_last_block(o, b, s) ((s >= o) && (s <= (o + b))) + if (HAS_HOLES((&replies[source].poststat)) && + offset >= replies[i].poststat.ia_size && + !is_last_block(offset, size, replies[source].poststat.ia_size) && + (iov_0filled(iovec, count) == 0)) + continue; + + /* Avoid filling up sparse regions of the sink with 0-filled + * writes.*/ + if (type == AFR_SELFHEAL_DATA_FULL && + HAS_HOLES((&replies[source].poststat)) && + ((offset + size) <= replies[i].poststat.ia_size) && + (iov_0filled(iovec, count) == 0) && + __afr_is_sink_zero_filled(this, fd, size, offset, i)) { + continue; + } + + ret = syncop_writev(priv->children[i], fd, iovec, count, offset, iobref, + 0, NULL, NULL, NULL, NULL); + if (ret != iov_length(iovec, count)) { + /* write() failed on this sink. unset the corresponding + member in sinks[] (which is healed_sinks[] in the + caller) so that this server does NOT get considered + as successfully healed. + */ + healed_sinks[i] = 0; + } + } + if (iovec) + GF_FREE(iovec); + if (iobref) + iobref_unref(iobref); + + return ret; +} + +static gf_boolean_t +afr_source_sinks_locked(xlator_t *this, unsigned char *locked_on, int source, + unsigned char *healed_sinks) +{ + afr_private_t *priv = this->private; + int i = 0; + + if (!locked_on[source]) + return _gf_false; -struct afr_sh_algorithm * -afr_sh_data_pick_algo (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - struct afr_sh_algorithm * algo = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - algo = sh_algo_from_name (this, priv->data_self_heal_algorithm); - - if (algo == NULL) { - /* option not set, so fall back on heuristics */ - - if (sh_zero_byte_files_exist (local, priv->child_count) - || (sh->file_size <= (priv->data_self_heal_window_size * - this->ctx->page_size))) { - - /* - * If the file does not exist on one of the subvolumes, - * or a zero-byte file exists (created by entry self-heal) - * the entire content has to be copied anyway, so there - * is no benefit from using the "diff" algorithm. - * - * If the file size is about the same as page size, - * the entire file can be read and written with a few - * (pipelined) STACK_WINDs, which will be faster - * than "diff" which has to read checksums and then - * read and write. - */ - - algo = sh_algo_from_name (this, "full"); - - } else { - algo = sh_algo_from_name (this, "diff"); - } - } + for (i = 0; i < priv->child_count; i++) { + if (healed_sinks[i] && locked_on[i]) + return _gf_true; + } - return algo; + return _gf_false; } - -int -afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this) +static int +afr_selfheal_data_block(call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *healed_sinks, off_t offset, + size_t size, int type, struct afr_reply *replies) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - struct afr_sh_algorithm *sh_algo = NULL; + int ret = -1; + afr_private_t *priv = NULL; + unsigned char *data_lock = NULL; - local = frame->local; - sh = &local->self_heal; - - sh->algo_completion_cbk = afr_sh_data_fsync; - sh->algo_abort_cbk = afr_sh_data_fail; - - sh_algo = afr_sh_data_pick_algo (frame, this); - - sh->algo = sh_algo; - sh_algo->fn (frame, this); - - return 0; -} + priv = this->private; + data_lock = alloca0(priv->child_count); -int -afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) -{ - int call_count = 0; - int child_index = 0; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "ftruncate of %s on subvolume %s failed (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "ftruncate of %s on subvolume %s completed", - local->loc.path, - priv->children[child_index]->name); - } + ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, offset, size, + data_lock); + { + if (!afr_source_sinks_locked(this, data_lock, source, healed_sinks)) { + ret = -ENOTCONN; + goto unlock; } - UNLOCK (&frame->lock); - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) - afr_sh_data_fail (frame, this); - else - afr_sh_data_sync_prepare (frame, this); + if (type == AFR_SELFHEAL_DATA_DIFF && + __afr_can_skip_data_block_heal(frame, this, fd, source, + healed_sinks, offset, size, + &replies[source].poststat)) { + ret = 0; + goto unlock; } - return 0; + ret = __afr_selfheal_data_read_write( + frame, this, fd, source, healed_sinks, offset, size, replies, type); + } +unlock: + afr_selfheal_uninodelk(frame, this, fd->inode, this->name, offset, size, + data_lock); + return ret; } - -int -afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this) +static int +afr_selfheal_data_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *healed_sinks) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t *sh = NULL; - int *sources = NULL; - int call_count = 0; - int i = 0; - + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; - priv = this->private; - local = frame->local; - sh = &local->self_heal; + local = frame->local; + priv = this->private; - sources = sh->sources; - call_count = sh->active_sinks; - - local->call_count = call_count; + if (!priv->ensure_durability) + return 0; - for (i = 0; i < priv->child_count; i++) { - if (sources[i] || !local->child_up[i]) - continue; - - STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->ftruncate, - sh->healing_fd, sh->file_size, - NULL); - - if (!--call_count) - break; - } + AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, fsync, fd, 0, NULL); - return 0; + for (i = 0; i < priv->child_count; i++) + if (healed_sinks[i] && local->replies[i].op_ret != 0) + /* fsync() failed. Do NOT consider this server + as successfully healed. Mark it so. + */ + healed_sinks[i] = 0; + return 0; } -int -afr_sh_inode_set_read_ctx (afr_self_heal_t *sh, xlator_t *this) +static int +afr_data_self_heal_type_get(afr_private_t *priv, unsigned char *healed_sinks, + int source, struct afr_reply *replies) { - afr_private_t *priv = NULL; - int ret = 0; - int i = 0; - - priv = this->private; - sh->source = afr_sh_select_source (sh->sources, priv->child_count); - if (sh->source < 0) { - ret = -1; - goto out; - } + int type = AFR_SELFHEAL_DATA_FULL; + int i = 0; - /* detect changes not visible through pending flags -- JIC */ + if (priv->data_self_heal_algorithm == AFR_SELFHEAL_DATA_DYNAMIC) { + type = AFR_SELFHEAL_DATA_FULL; for (i = 0; i < priv->child_count; i++) { - if (i == sh->source || sh->child_errno[i]) - continue; - - if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[sh->source])) - sh->sources[i] = 0; + if (!healed_sinks[i] && i != source) + continue; + if (replies[i].poststat.ia_size) { + type = AFR_SELFHEAL_DATA_DIFF; + break; + } } - - afr_reset_children (sh->fresh_children, priv->child_count); - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_children, priv->child_count); - afr_inode_set_read_ctx (this, sh->inode, sh->source, - sh->fresh_children); -out: - return ret; + } else { + type = priv->data_self_heal_algorithm; + } + return type; } -char* -afr_get_sizes_str (afr_local_t *local, struct iatt *bufs, xlator_t *this) +static int +afr_selfheal_data_do(call_frame_t *frame, xlator_t *this, fd_t *fd, int source, + unsigned char *healed_sinks, struct afr_reply *replies) { - afr_private_t *priv = NULL; - int i = 0; - char num[1024] = {0}; - size_t len = 0; - char *sizes_str = NULL; - size_t off = 0; - char *fmt_str = "%llu bytes on %s, "; - char *child_down = " %s,"; - char *child_unknown = " %s,"; - int down_child_present = 0; - int down_count = 0; - int unknown_count = 0; - int unknown_child_present = 0; - char *down_subvol_1 = " down subvolume is "; - char *unknown_subvol_1 = " unknown subvolume is "; - char *down_subvol_2 = " down subvolumes are "; - char *unknown_subvol_2 = " unknown subvolumes are "; - - priv = this->private; + afr_private_t *priv = NULL; + off_t off = 0; + size_t block = 0; + int type = AFR_SELFHEAL_DATA_FULL; + int ret = -1; + call_frame_t *iter_frame = NULL; + unsigned char arbiter_sink_status = 0; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] == 1) { - len += snprintf (num, sizeof (num), fmt_str, - (unsigned long long) bufs[i].ia_size, - priv->children[i]->name); - } else if (local->child_up[i] == 0) { - len += snprintf (num, sizeof (num), child_down, - priv->children[i]->name); - if (!down_child_present) - down_child_present = 1; - down_count ++; - } else if (local->child_up[i] == -1) { - len += snprintf (num, sizeof (num), child_unknown, - priv->children[i]->name); - if (!unknown_child_present) - unknown_child_present = 1; - unknown_count++; - } - - } - - if (down_child_present) { - if (down_count > 1) - len += snprintf (num, sizeof (num), "%s", - down_subvol_2); - else - len += snprintf (num, sizeof (num), "%s", - down_subvol_1); - } - if (unknown_child_present) { - if (unknown_count > 1) - len += snprintf (num, sizeof (num), "%s", - unknown_subvol_2); - else - len += snprintf (num, sizeof (num), "%s", - unknown_subvol_1); - } - - len++;//for '\0' - - sizes_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char); - - if (!sizes_str) - return NULL; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] == 1) { - off += snprintf (sizes_str + off, len - off, fmt_str, - (unsigned long long) bufs[i].ia_size, - priv->children[i]->name); - } - } + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO, + "performing data selfheal on %s", uuid_utoa(fd->inode->gfid)); - if (down_child_present) { - if (down_count > 1) { - off += snprintf (sizes_str + off, len - off, "%s", - down_subvol_2); - } else { - off += snprintf (sizes_str + off, len - off, "%s", - down_subvol_1); - } - } + priv = this->private; + if (priv->arbiter_count) { + arbiter_sink_status = healed_sinks[ARBITER_BRICK_INDEX]; + healed_sinks[ARBITER_BRICK_INDEX] = 0; + } - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] == 0) { - off += snprintf (sizes_str + off, len - off, child_down, - priv->children[i]->name); - } - } + block = 128 * 1024 * priv->data_self_heal_window_size; - if (unknown_child_present) { - if (unknown_count > 1) { - off += snprintf (sizes_str + off, len - off, "%s", - unknown_subvol_2); - } else { - off += snprintf (sizes_str + off, len - off, "%s", - unknown_subvol_1); - } - } + type = afr_data_self_heal_type_get(priv, healed_sinks, source, replies); - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] == -1) { - off += snprintf (sizes_str + off, len - off, - child_unknown, - priv->children[i]->name); + iter_frame = afr_copy_frame(frame); + if (!iter_frame) { + ret = -ENOMEM; + goto out; + } - } + for (off = 0; off < replies[source].poststat.ia_size; off += block) { + if (AFR_COUNT(healed_sinks, priv->child_count) == 0) { + ret = -ENOTCONN; + goto out; } - return sizes_str; -} + ret = afr_selfheal_data_block(iter_frame, this, fd, source, + healed_sinks, off, block, type, replies); + if (ret < 0) + goto out; -char* -afr_get_sinks_str (xlator_t *this, afr_local_t *local, afr_self_heal_t *sh) -{ - afr_private_t *priv = NULL; - int i = 0; - char num[1024] = {0}; - size_t len = 0; - char *sinks_str = NULL; - char *temp_str = " to sinks "; - char *str_format = " %s,"; - char off = 0; - - priv = this->private; - - len += snprintf (num, sizeof (num), "%s", temp_str); - for (i = 0; i < priv->child_count; i++) { - if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) { - len += snprintf (num, sizeof (num), str_format, - priv->children[i]->name); - } + AFR_STACK_RESET(iter_frame); + if (iter_frame->local == NULL) { + ret = -ENOTCONN; + goto out; } + } - len ++; - - sinks_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char); - - if (!sinks_str) - return NULL; + ret = afr_selfheal_data_fsync(frame, this, fd, healed_sinks); - off += snprintf (sinks_str + off, len - off, "%s", temp_str); - - for (i = 0; i < priv->child_count; i++) { - if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) { - off += snprintf (sinks_str + off, len - off, - str_format, - priv->children[i]->name); - } - } - - return sinks_str; +out: + if (arbiter_sink_status) + healed_sinks[ARBITER_BRICK_INDEX] = arbiter_sink_status; + if (iter_frame) + AFR_STACK_DESTROY(iter_frame); + return ret; } - -void -afr_set_data_sh_info_str (afr_local_t *local, afr_self_heal_t *sh, xlator_t *this) +static int +__afr_selfheal_truncate_sinks(call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *healed_sinks, uint64_t size) { - char *pending_matrix_str = NULL; - char *sizes_str = NULL; - char *sinks_str = NULL; - afr_private_t *priv = NULL; - - priv = this->private; - - pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix, - this); - if (!pending_matrix_str) - pending_matrix_str = ""; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; - sizes_str = afr_get_sizes_str (local, sh->buf, this); - if (!sizes_str) - sizes_str = ""; + local = frame->local; + priv = this->private; - sinks_str = afr_get_sinks_str (this, local, sh); - if (!sinks_str) - sinks_str = ""; + /* This will send truncate on the arbiter brick as well if it is marked as + * sink. If changelog is enabled on the volume it captures truncate as a + * data transactions on the arbiter brick. This will help geo-rep to + * properly sync the data from master to slave if arbiter is the ACTIVE + * brick during syncing and which had got some entries healed for data as + * part of self heal. + */ + AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, ftruncate, fd, size, + NULL); - gf_asprintf (&sh->data_sh_info, " data self heal from %s %s with " - "%s data %s", priv->children[sh->source]->name, sinks_str, - sizes_str, pending_matrix_str); + for (i = 0; i < priv->child_count; i++) + if (healed_sinks[i] && local->replies[i].op_ret == -1) + /* truncate() failed. Do NOT consider this server + as successfully healed. Mark it so. + */ + healed_sinks[i] = 0; - if (pending_matrix_str && strcmp (pending_matrix_str, "")) - GF_FREE (pending_matrix_str); - - if (sizes_str && strcmp (sizes_str, "")) - GF_FREE (sizes_str); + return 0; } -void -afr_sh_data_fix (call_frame_t *frame, xlator_t *this) +gf_boolean_t +afr_has_source_witnesses(xlator_t *this, unsigned char *sources, + uint64_t *witness) { - int source = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - sh->block_size = this->ctx->page_size; - sh->file_size = sh->buf[source].ia_size; - - if (FILE_HAS_HOLES (&sh->buf[source])) - sh->file_has_holes = 1; - - if (sh->background && sh->unwind && !sh->unwound) { - sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno, - is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)); - sh->unwound = _gf_true; - } - - afr_sh_mark_source_sinks (frame, this); - if (sh->active_sinks == 0) { - gf_log (this->name, GF_LOG_INFO, - "no active sinks for performing self-heal on file %s", - local->loc.path); - afr_sh_data_finish (frame, this); - return; - } + int i = 0; + afr_private_t *priv = NULL; - gf_log (this->name, GF_LOG_DEBUG, - "self-healing file %s from subvolume %s to %d other", - local->loc.path, priv->children[sh->source]->name, - sh->active_sinks); + priv = this->private; - sh->actual_sh_started = _gf_true; - afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); - afr_sh_data_trim_sinks (frame, this); + for (i = 0; i < priv->child_count; i++) { + if (sources[i] && witness[i]) + return _gf_true; + } + return _gf_false; } -int -afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this) +static gf_boolean_t +afr_does_size_mismatch(xlator_t *this, unsigned char *sources, + struct afr_reply *replies) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int nsources = 0; - int ret = 0; - int *old_sources = NULL; - int tstamp_source = 0; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - gf_log (this->name, GF_LOG_DEBUG, "Pending matrix for: %s", - lkowner_utoa (&frame->root->lk_owner)); - if (sh->sync_done) { - //store sources before sync so that mtime can be set using the - //iatt buf from one of them. - old_sources = alloca (priv->child_count*sizeof (*old_sources)); - memcpy (old_sources, sh->sources, - priv->child_count * sizeof (*old_sources)); - } + int i = 0; + afr_private_t *priv = NULL; + struct iatt *min = NULL; + struct iatt *max = NULL; - nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, - sh->sources, sh->success_children, - AFR_DATA_TRANSACTION, NULL, _gf_true); - if ((nsources == -1) - && (priv->favorite_child != -1) - && (sh->child_errno[priv->favorite_child] == 0)) { + priv = this->private; - gf_log (this->name, GF_LOG_DEBUG, - "Picking favorite child %s as authentic source to " - "resolve conflicting data of %s", - priv->children[priv->favorite_child]->name, - local->loc.path); + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; - sh->sources[priv->favorite_child] = 1; + if (replies[i].op_ret < 0) + continue; - nsources = afr_sh_source_count (sh->sources, - priv->child_count); - } + if (!sources[i]) + continue; - if (nsources == -1) { - afr_sh_print_split_brain_log (sh->pending_matrix, this, - local->loc.path); - afr_set_split_brain (this, sh->inode, DONT_KNOW, SPB); + if (AFR_IS_ARBITER_BRICK(priv, i) && (replies[i].poststat.ia_size == 0)) + continue; - afr_sh_data_fail (frame, this); - return 0; - } + if (!min) + min = &replies[i].poststat; - afr_set_split_brain (this, sh->inode, DONT_KNOW, NO_SPB); + if (!max) + max = &replies[i].poststat; - ret = afr_sh_inode_set_read_ctx (sh, this); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "No active sources found."); + if (min->ia_size > replies[i].poststat.ia_size) + min = &replies[i].poststat; - afr_sh_data_fail (frame, this); - return 0; - } + if (max->ia_size < replies[i].poststat.ia_size) + max = &replies[i].poststat; + } - if (sh->sync_done) { - /* Perform setattr from one of the old_sources if possible - * Because only they have the correct mtime, the new sources - * (i.e. old sinks) have mtime from last writev in sync. - */ - tstamp_source = sh->source; - for (i = 0; i < priv->child_count; i++) { - if (old_sources[i] && sh->sources[i]) - tstamp_source = i; - } - afr_sh_data_setattr (frame, this, &sh->buf[tstamp_source]); - } else { - afr_set_data_sh_info_str (local, sh, this); - if (nsources == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "No self-heal needed for %s", - local->loc.path); - - afr_sh_data_finish (frame, this); - return 0; - } - - if (sh->do_data_self_heal && - afr_data_self_heal_enabled (priv->data_self_heal)) - afr_sh_data_fix (frame, this); - else - afr_sh_data_finish (frame, this); - } - return 0; -} + if (min && max) { + if (min->ia_size != max->ia_size) + return _gf_true; + } -int -afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, - dict_t **xattr, - afr_transaction_type txn_type, - uuid_t gfid) -{ - afr_private_t *priv = NULL; - int read_child = -1; - int32_t **pending_matrix = NULL; - int32_t *sources = NULL; - int32_t *success_children = NULL; - struct iatt *bufs = NULL; - int32_t nsources = 0; - int32_t prev_read_child = -1; - int32_t config_read_child = -1; - int32_t subvol_status = 0; - - priv = this->private; - bufs = local->cont.lookup.bufs; - success_children = local->cont.lookup.success_children; - - pending_matrix = local->cont.lookup.pending_matrix; - sources = local->cont.lookup.sources; - memset (sources, 0, sizeof (*sources) * priv->child_count); - - nsources = afr_build_sources (this, xattr, bufs, pending_matrix, - sources, success_children, txn_type, - &subvol_status, _gf_false); - if (subvol_status & SPLIT_BRAIN) { - gf_log (this->name, GF_LOG_DEBUG, "%s: Possible split-brain", - local->loc.path); - switch (txn_type) { - case AFR_DATA_TRANSACTION: - local->cont.lookup.possible_spb = _gf_true; - nsources = 1; - sources[success_children[0]] = 1; - break; - case AFR_ENTRY_TRANSACTION: - read_child = afr_get_no_xattr_dir_read_child (this, - success_children, - bufs); - sources[read_child] = 1; - nsources = 1; - break; - default: - break; - } - } - if (nsources < 0) - goto out; - - prev_read_child = local->read_child_index; - config_read_child = priv->read_child; - read_child = afr_select_read_child_from_policy (success_children, - priv->child_count, - prev_read_child, - config_read_child, - sources, - priv->hash_mode, gfid); -out: - gf_log (this->name, GF_LOG_DEBUG, "returning read_child: %d", - read_child); - return read_child; + return _gf_false; } -int -afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *buf, dict_t *xdata) +static void +afr_mark_biggest_witness_as_source(xlator_t *this, unsigned char *sources, + uint64_t *witness) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int call_count = -1; - int child_index = (long) cookie; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret != -1) { - gf_log (this->name, GF_LOG_TRACE, - "fstat of %s on %s succeeded", - local->loc.path, - priv->children[child_index]->name); - - sh->buf[child_index] = *buf; - sh->success_children[sh->success_count] = child_index; - sh->success_count++; - } else { - gf_log (this->name, GF_LOG_ERROR, "%s: fstat failed " - "on %s, reason %s", local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - sh->child_errno[child_index] = op_errno; - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - /* Previous versions of glusterfs might have set - * the pending data xattrs which need to be erased - */ - if (!afr_sh_data_proceed (sh->success_count)) { - gf_log (this->name, GF_LOG_ERROR, "inspecting metadata " - "succeeded on < %d children, aborting " - "self-heal for %s", AFR_SH_MIN_PARTICIPANTS, - local->loc.path); - afr_sh_data_fail (frame, this); - goto out; - } - afr_sh_data_fxattrop_fstat_done (frame, this); - } -out: - return 0; -} + int i = 0; + afr_private_t *priv = NULL; + uint64_t biggest_witness = 0; + priv = this->private; + /* Find source with biggest witness count */ + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (biggest_witness < witness[i]) + biggest_witness = witness[i]; + } -int -afr_sh_data_fstat (call_frame_t *frame, xlator_t *this) -{ - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; - int child = 0; - int32_t *fstat_children = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - fstat_children = memdup (sh->success_children, - sizeof (*fstat_children) * priv->child_count); - if (!fstat_children) { - afr_sh_data_fail (frame, this); - goto out; - } - call_count = sh->success_count; - local->call_count = call_count; + /* Mark files with less witness count as not source */ + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (witness[i] < biggest_witness) + sources[i] = 0; + } - memset (sh->buf, 0, sizeof (*sh->buf) * priv->child_count); - afr_reset_children (sh->success_children, priv->child_count); - sh->success_count = 0; - for (i = 0; i < priv->child_count; i++) { - child = fstat_children[i]; - if (child == -1) - break; - STACK_WIND_COOKIE (frame, afr_sh_data_fstat_cbk, - (void *) (long) child, - priv->children[child], - priv->children[child]->fops->fstat, - sh->healing_fd, NULL); - --call_count; - } - GF_ASSERT (!call_count); -out: - GF_FREE (fstat_children); - return 0; -} - -void -afr_sh_common_fxattrop_resp_handler (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int child_index = (long) cookie; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret != -1) { - gf_log (this->name, GF_LOG_TRACE, - "fxattrop of %s on %s succeeded", - local->loc.path, - priv->children[child_index]->name); - - sh->xattr[child_index] = dict_ref (xattr); - sh->success_children[sh->success_count] = child_index; - sh->success_count++; - } else { - gf_log (this->name, GF_LOG_ERROR, "fxattrop of %s " - "failed on %s, reason %s", local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - sh->child_errno[child_index] = op_errno; - } - } - UNLOCK (&frame->lock); + return; } -int -afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr, dict_t *xdata) +/* This is a tie breaker function. Only one source be assigned here */ +static void +afr_mark_newest_file_as_source(xlator_t *this, unsigned char *sources, + struct afr_reply *replies) { - int call_count = -1; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - afr_sh_common_fxattrop_resp_handler (frame, cookie, this, op_ret, - op_errno, xattr); - - call_count = afr_frame_return (frame); - if (call_count == 0) { - if (!afr_sh_data_proceed (sh->success_count)) { - gf_log (this->name, GF_LOG_ERROR, "%s, inspecting " - "change log succeeded on < %d children", - local->loc.path, AFR_SH_MIN_PARTICIPANTS); - afr_sh_data_fail (frame, this); - goto out; - } - afr_sh_data_fstat (frame, this); - } -out: - return 0; -} + int i = 0; + afr_private_t *priv = NULL; + int source = -1; + uint32_t max_ctime = 0; + priv = this->private; + /* Find source with latest ctime */ + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; -int -afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this) -{ - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - dict_t **xattr_req; - int32_t *zero_pending = NULL; - int call_count = 0; - int i = 0; - int ret = 0; - int j; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - call_count = afr_up_children_count (local->child_up, - priv->child_count); - - local->call_count = call_count; - - xattr_req = GF_CALLOC(priv->child_count, sizeof(struct dict_t *), - gf_afr_mt_dict_t); - if (!xattr_req) - goto out; - - for (i = 0; i < priv->child_count; i++) { - xattr_req[i] = dict_new(); - if (!xattr_req[i]) { - ret = -1; - goto out; - } - } - - for (i = 0; i < priv->child_count; i++) { - for (j = 0; j < priv->child_count; j++) { - zero_pending = GF_CALLOC (3, sizeof (*zero_pending), - gf_afr_mt_int32_t); - if (!zero_pending) { - ret = -1; - goto out; - } - ret = dict_set_dynptr (xattr_req[i], priv->pending_key[j], - zero_pending, - 3 * sizeof (*zero_pending)); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value"); - goto out; - } else { - zero_pending = NULL; - } - } - } - - afr_reset_xattr (sh->xattr, priv->child_count); - afr_reset_children (sh->success_children, priv->child_count); - memset (sh->child_errno, 0, - sizeof (*sh->child_errno) * priv->child_count); - sh->success_count = 0; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_sh_data_fxattrop_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - sh->healing_fd, GF_XATTROP_ADD_ARRAY, - xattr_req[i], NULL); - - if (!--call_count) - break; - } - } - -out: - if (xattr_req) { - for (i = 0; i < priv->child_count; i++) - if (xattr_req[i]) - dict_unref(xattr_req[i]); - GF_FREE(xattr_req); - } - - if (ret) { - GF_FREE (zero_pending); - afr_sh_data_fail (frame, this); + if (max_ctime <= replies[i].poststat.ia_ctime) { + source = i; + max_ctime = replies[i].poststat.ia_ctime; } + } - return 0; + /* Only mark one of the files as source to break ties */ + memset(sources, 0, sizeof(*sources) * priv->child_count); + sources[source] = 1; } -int -afr_sh_data_big_lock_success (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; +static int +__afr_selfheal_data_finalize_source( + call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on, + unsigned char *undid_pending, struct afr_reply *replies, uint64_t *witness) +{ + afr_private_t *priv = NULL; + int source = -1; + int sources_count = 0; + priv = this->private; + + sources_count = AFR_COUNT(sources, priv->child_count); + + if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) || + !sources_count) { + /* split brain */ + source = afr_mark_split_brain_source_sinks( + frame, this, inode, sources, sinks, healed_sinks, locked_on, + replies, AFR_DATA_TRANSACTION); + if (source < 0) { + gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" + "subvol=%s;type=data;" + "file=%s", + this->ctx->cmd_args.client_pid, this->name, + uuid_utoa(inode->gfid)); + return -EIO; + } + + _afr_fav_child_reset_sink_xattrs( + frame, this, inode, source, healed_sinks, undid_pending, + AFR_DATA_TRANSACTION, locked_on, replies); + goto out; + } + + /* No split brain at this point. If we were called from + * afr_heal_splitbrain_file(), abort.*/ + if (afr_dict_contains_heal_op(frame)) + return -EIO; + + /* If there are no witnesses/size-mismatches on sources we are done*/ + if (!afr_does_size_mismatch(this, sources, replies) && + !afr_has_source_witnesses(this, sources, witness)) + goto out; + + afr_mark_largest_file_as_source(this, sources, replies); + afr_mark_biggest_witness_as_source(this, sources, witness); + afr_mark_newest_file_as_source(this, sources, replies); + if (priv->arbiter_count) + /* Choose non-arbiter brick as source for empty files. */ + afr_mark_source_sinks_if_file_empty(this, sources, sinks, healed_sinks, + locked_on, replies, + AFR_DATA_TRANSACTION); - local = frame->local; - sh = &local->self_heal; +out: + afr_mark_active_sinks(this, sources, locked_on, healed_sinks); + source = afr_choose_source_by_policy(priv, sources, AFR_DATA_TRANSACTION); - sh->data_lock_held = _gf_true; - afr_sh_data_fxattrop (frame, this); - return 0; + return source; } +/* + * __afr_selfheal_data_prepare: + * + * This function inspects the on-disk xattrs and determines which subvols + * are sources and sinks. + * + * The return value is the index of the subvolume to be used as the source + * for self-healing, or -1 if no healing is necessary/split brain. + */ int -afr_sh_dom_lock_success (call_frame_t *frame, xlator_t *this) +__afr_selfheal_data_prepare(call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *locked_on, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, + unsigned char *undid_pending, + struct afr_reply *replies, unsigned char *pflag) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - sh->sh_dom_lock_held = _gf_true; - afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name, - afr_sh_data_big_lock_success, - afr_sh_data_fail); - return 0; -} + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + uint64_t *witness = NULL; -int -afr_sh_data_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; + ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies); - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "Blocking data inodelks " - "failed for %s. by %s", - local->loc.path, lkowner_utoa (&frame->root->lk_owner)); + if (ret) + return ret; - sh->data_lock_failure_handler (frame, this); - } else { + witness = alloca0(priv->child_count * sizeof(*witness)); + ret = afr_selfheal_find_direction(frame, this, replies, + AFR_DATA_TRANSACTION, locked_on, sources, + sinks, witness, pflag); + if (ret) + return ret; - gf_log (this->name, GF_LOG_DEBUG, "Blocking data inodelks " - "done for %s by %s. Proceding to self-heal", - local->loc.path, lkowner_utoa (&frame->root->lk_owner)); + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. + */ + AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count); - sh->data_lock_success_handler (frame, this); - } + source = __afr_selfheal_data_finalize_source( + frame, this, inode, sources, sinks, healed_sinks, locked_on, + undid_pending, replies, witness); + if (source < 0) + return -EIO; - return 0; + return source; } -int -afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; - - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks " - "failed for %s. by %s", - local->loc.path, lkowner_utoa (&frame->root->lk_owner)); - - if (!sh->data_lock_block) { - sh->data_lock_failure_handler(frame, this); - } else { - int_lock->lock_cbk = - afr_sh_data_post_blocking_inodelk_cbk; - afr_blocking_lock (frame, this); - } - } else { +static int +__afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *locked_on) +{ + afr_private_t *priv = NULL; + int ret = -1; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *data_lock = NULL; + unsigned char *healed_sinks = NULL; + unsigned char *undid_pending = NULL; + struct afr_reply *locked_replies = NULL; + int source = -1; + gf_boolean_t did_sh = _gf_true; + gf_boolean_t is_arbiter_the_only_sink = _gf_false; + gf_boolean_t empty_file = _gf_false; + + priv = this->private; + + sources = alloca0(priv->child_count); + sinks = alloca0(priv->child_count); + healed_sinks = alloca0(priv->child_count); + data_lock = alloca0(priv->child_count); + undid_pending = alloca0(priv->child_count); + + locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count); + + ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, 0, 0, + data_lock); + { + if (ret < priv->child_count) { + gf_msg_debug(this->name, 0, + "%s: Skipping " + "self-heal as only %d number " + "of subvolumes " + "could be locked", + uuid_utoa(fd->inode->gfid), ret); + ret = -ENOTCONN; + goto unlock; + } + + ret = __afr_selfheal_data_prepare(frame, this, fd->inode, data_lock, + sources, sinks, healed_sinks, + undid_pending, locked_replies, NULL); + if (ret < 0) + goto unlock; + + if (AFR_COUNT(healed_sinks, priv->child_count) == 0) { + did_sh = _gf_false; + goto unlock; + } + + source = ret; + + if (AFR_IS_ARBITER_BRICK(priv, source)) { + empty_file = afr_is_file_empty_on_all_children(priv, + locked_replies); + if (empty_file) + goto restore_time; + + did_sh = _gf_false; + goto unlock; + } + + ret = __afr_selfheal_truncate_sinks( + frame, this, fd, healed_sinks, + locked_replies[source].poststat.ia_size); + if (ret < 0) + goto unlock; + + if (priv->arbiter_count && + AFR_COUNT(healed_sinks, priv->child_count) == 1 && + healed_sinks[ARBITER_BRICK_INDEX]) { + is_arbiter_the_only_sink = _gf_true; + goto restore_time; + } + ret = 0; + } +unlock: + afr_selfheal_uninodelk(frame, this, fd->inode, this->name, 0, 0, data_lock); + if (ret < 0) + goto out; + + if (!did_sh) + goto out; + + ret = afr_selfheal_data_do(frame, this, fd, source, healed_sinks, + locked_replies); + if (ret) + goto out; +restore_time: + afr_selfheal_restore_time(frame, this, fd->inode, source, healed_sinks, + locked_replies); + + if (!is_arbiter_the_only_sink && !empty_file) { + ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, 0, 0, + data_lock); + if (ret < priv->child_count) { + ret = -ENOTCONN; + did_sh = _gf_false; + goto skip_undo_pending; + } + } + ret = afr_selfheal_undo_pending( + frame, this, fd->inode, sources, sinks, healed_sinks, undid_pending, + AFR_DATA_TRANSACTION, locked_replies, data_lock); +skip_undo_pending: + afr_selfheal_uninodelk(frame, this, fd->inode, this->name, 0, 0, data_lock); +out: - gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks " - "done for %s by %s. Proceeding to self-heal", - local->loc.path, lkowner_utoa (&frame->root->lk_owner)); - sh->data_lock_success_handler (frame, this); - } + if (did_sh) + afr_log_selfheal(fd->inode->gfid, this, ret, "data", source, sources, + healed_sinks); + else + ret = 1; - return 0; + if (locked_replies) + afr_replies_wipe(locked_replies, priv->child_count); + + return ret; } int -afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, char *dom, - off_t start, off_t len) +afr_selfheal_data_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, + dict_t *xdata) { - afr_internal_lock_t *int_lock = NULL; - afr_inodelk_t *inodelk = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; + afr_local_t *local = NULL; + int i = (long)cookie; - int_lock->transaction_lk_type = AFR_SELFHEAL_LK; - int_lock->selfheal_lk_type = AFR_DATA_SELF_HEAL_LK; + local = frame->local; - afr_set_lock_number (frame, this); + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; - int_lock->lock_cbk = afr_sh_data_post_nonblocking_inodelk_cbk; - - int_lock->domain = dom; - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - inodelk->flock.l_start = start; - inodelk->flock.l_len = len; - inodelk->flock.l_type = F_WRLCK; - - afr_nonblocking_inodelk (frame, this); - - return 0; -} + syncbarrier_wake(&local->barrier); -int -afr_post_sh_big_lock_success (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - GF_ASSERT (sh->old_loop_frame); - sh_loop_finish (sh->old_loop_frame, this); - sh->old_loop_frame = NULL; - sh->data_lock_held = _gf_true; - sh->sync_done = _gf_true; - afr_sh_data_fxattrop (frame, this); - return 0; + return 0; } int -afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this) +afr_selfheal_data_open(xlator_t *this, inode_t *inode, fd_t **fd) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + int ret = 0; + fd_t *fd_tmp = NULL; + loc_t loc = { + 0, + }; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; - local = frame->local; - sh = &local->self_heal; + priv = this->private; - GF_ASSERT (sh->old_loop_frame); - sh_loop_finish (sh->old_loop_frame, this); - sh->old_loop_frame = NULL; - afr_sh_set_timestamps (frame, this); - return 0; -} + fd_tmp = fd_create(inode, 0); + if (!fd_tmp) + return -ENOMEM; + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); -int -afr_sh_data_lock (call_frame_t *frame, xlator_t *this, - off_t start, off_t len, gf_boolean_t block, - char *dom, afr_lock_cbk_t success_handler, - afr_lock_cbk_t failure_handler) -{ - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; + frame = afr_frame_create(this, &ret); + if (!frame) { + ret = -ret; + fd_unref(fd_tmp); + goto out; + } + local = frame->local; - local = frame->local; - sh = &local->self_heal; + AFR_ONLIST(local->child_up, frame, afr_selfheal_data_open_cbk, open, &loc, + O_RDWR | O_LARGEFILE, fd_tmp, NULL); - sh->data_lock_success_handler = success_handler; - sh->data_lock_failure_handler = failure_handler; - sh->data_lock_block = block; - return afr_sh_data_lock_rec (frame, this, dom, start, len); -} + ret = -ENOTCONN; + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; -int -afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - child_index = (long) cookie; - - /* TODO: some of the open's might fail. - In that case, modify cleanup fn to send flush on those - fd's which are already open */ - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "open of %s failed on child %s (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } else { - gf_log (this->name, GF_LOG_TRACE, - "open of %s succeeded on child %s", - local->loc.path, - priv->children[child_index]->name); - } + if (local->replies[i].op_ret < 0) { + ret = -local->replies[i].op_errno; + continue; } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - if (call_count == 0) { - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - afr_sh_data_fail (frame, this); - return 0; - } + ret = 0; + break; + } - gf_log (this->name, GF_LOG_TRACE, - "fd for %s opened, commencing sync", - local->loc.path); + if (ret < 0) { + fd_unref(fd_tmp); + goto out; + } else { + fd_bind(fd_tmp); + } - afr_sh_data_lock (frame, this, 0, 0, _gf_true, priv->sh_domain, - afr_sh_dom_lock_success, afr_sh_data_fail); - } - - return 0; + *fd = fd_tmp; +out: + loc_wipe(&loc); + if (frame) + AFR_STACK_DESTROY(frame); + return ret; } - int -afr_sh_data_open (call_frame_t *frame, xlator_t *this) +afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd) { - int i = 0; - int call_count = 0; - fd_t *fd = NULL; - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - call_count = afr_up_children_count (local->child_up, priv->child_count); - local->call_count = call_count; - - fd = fd_create (local->loc.inode, frame->root->pid); - sh->healing_fd = fd; - - /* open sinks */ - for (i = 0; i < priv->child_count; i++) { - if(!local->child_up[i]) - continue; - - STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->open, - &local->loc, - O_RDWR|O_LARGEFILE, fd, NULL); - - if (!--call_count) - break; - } + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + int ret = 0; + inode_t *inode = fd->inode; - return 0; -} - -void -afr_sh_non_reg_fix (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - int i = 0; - - if (op_ret < 0) { - afr_sh_data_fail (frame, this); - return; - } + priv = this->private; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + locked_on = alloca0(priv->child_count); - for (i = 0; i < priv->child_count ; i++) { - if (1 == local->child_up[i]) - sh->success[i] = 1; + ret = afr_selfheal_tie_breaker_inodelk(frame, this, inode, priv->sh_domain, + 0, 0, locked_on); + { + if (ret < priv->child_count) { + gf_msg_debug(this->name, 0, + "%s: Skipping " + "self-heal as only %d number of " + "subvolumes could be locked", + uuid_utoa(fd->inode->gfid), ret); + /* Either less than two subvols available, or another + selfheal (from another server) is in progress. Skip + for now in any case there isn't anything to do. + */ + ret = -ENOTCONN; + goto unlock; } - afr_sh_erase_pending (frame, this, AFR_DATA_TRANSACTION, - afr_sh_data_erase_pending_cbk, - afr_sh_data_finish); -} - -int -afr_sh_non_reg_lock_success (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - sh->data_lock_held = _gf_true; - afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_non_reg_fix, NULL, - AFR_LOOKUP_FAIL_CONFLICTS | - AFR_LOOKUP_FAIL_MISSING_GFIDS, - NULL); - return 0; -} - -gf_boolean_t -afr_can_start_data_self_heal (afr_self_heal_t *sh, afr_private_t *priv) -{ - if (sh->force_confirm_spb) - return _gf_true; - if (sh->do_data_self_heal && - afr_data_self_heal_enabled (priv->data_self_heal)) - return _gf_true; - return _gf_false; -} + ret = __afr_selfheal_data(frame, this, fd, locked_on); + } +unlock: + afr_selfheal_uninodelk(frame, this, inode, priv->sh_domain, 0, 0, + locked_on); -int -afr_self_heal_data (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = this->private; - int ret = -1; - - local = frame->local; - sh = &local->self_heal; - - sh->sh_type_in_action = AFR_SELF_HEAL_DATA; - - if (afr_can_start_data_self_heal (sh, priv)) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); - ret = afr_inodelk_init (&local->internal_lock.inodelk[1], - priv->sh_domain, priv->child_count); - if (ret < 0) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_data_done (frame, this); - return 0; - } - - if (IA_ISREG (sh->type)) { - afr_sh_data_open (frame, this); - } else { - afr_sh_data_lock (frame, this, 0, 0, _gf_true, - this->name, - afr_sh_non_reg_lock_success, - afr_sh_data_fail); - } - } else { - gf_log (this->name, GF_LOG_TRACE, - "not doing data self heal on %s", - local->loc.path); - afr_sh_data_done (frame, this); - } - - return 0; + return ret; } diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index 53491a1d74c..64893f441e3 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -8,2363 +8,1269 @@ cases as published by the Free Software Foundation. */ -#include <libgen.h> -#include <unistd.h> -#include <fnmatch.h> -#include <sys/time.h> -#include <stdlib.h> -#include <signal.h> - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "inode.h" #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" - -#include "afr-transaction.h" #include "afr-self-heal.h" -#include "afr-self-heal-common.h" - -#define AFR_INIT_SH_FRAME_VALS(_frame, _local, _sh, _sh_frame, _sh_local, _sh_sh)\ - do {\ - _local = _frame->local;\ - _sh = &_local->self_heal;\ - _sh_frame = _sh->sh_frame;\ - _sh_local = _sh_frame->local;\ - _sh_sh = &_sh_local->self_heal;\ - } while (0); - -int -afr_sh_entry_impunge_create_file (call_frame_t *impunge_frame, xlator_t *this, - int child_index); -int -afr_sh_entry_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - sh->completion_cbk (frame, this); - - return 0; -} - - -int -afr_sh_entry_unlock (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - int_lock->lock_cbk = afr_sh_entry_done; - afr_unlock (frame, this); - - return 0; -} - - -int -afr_sh_entry_finish (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - gf_log (this->name, GF_LOG_TRACE, - "finishing entry selfheal of %s", local->loc.path); - - afr_sh_entry_unlock (frame, this); - - return 0; -} - - -int -afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr, dict_t *xdata) -{ - long i = 0; - int call_count = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_local_t *orig_local = NULL; - call_frame_t *orig_frame = NULL; - afr_private_t *priv = NULL; - int32_t read_child = -1; - - local = frame->local; - priv = this->private; - sh = &local->self_heal; - i = (long)cookie; - - - afr_children_add_child (sh->fresh_children, i, priv->child_count); - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "%s: failed to erase pending xattrs on %s (%s)", - local->loc.path, priv->children[i]->name, - strerror (op_errno)); - } - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (sh->source == -1) { - //this happens if the forced merge option is set - read_child = sh->fresh_children[0]; - } else { - read_child = sh->source; - } - afr_inode_set_read_ctx (this, sh->inode, read_child, - sh->fresh_children); - orig_frame = sh->orig_frame; - orig_local = orig_frame->local; - - if (sh->source != -1) { - orig_local->cont.lookup.buf.ia_nlink = sh->buf[sh->source].ia_nlink; - } - - afr_sh_entry_finish (frame, this); - } - - return 0; -} - +#include <glusterfs/byte-order.h> +#include "afr-transaction.h" +#include "afr-messages.h" +#include <glusterfs/syncop-utils.h> +#include <glusterfs/events.h> int -afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this) +afr_selfheal_entry_anon_inode(xlator_t *this, inode_t *dir, const char *name, + inode_t *inode, int child, + struct afr_reply *replies, + gf_boolean_t *anon_inode) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + xlator_t *subvol = NULL; + int ret = 0; + int i = 0; + char g[64] = {0}; + unsigned char *lookup_success = NULL; + call_frame_t *frame = NULL; + loc_t loc2 = { + 0, + }; + loc_t loc = { + 0, + }; + + priv = this->private; + subvol = priv->children[child]; + lookup_success = alloca0(priv->child_count); + uuid_utoa_r(replies[child].poststat.ia_gfid, g); + loc.inode = inode_new(inode->table); + if (!loc.inode) { + ret = -ENOMEM; + goto out; + } + + if (replies[child].poststat.ia_type == IA_IFDIR) { + /* This directory may have sub-directory hierarchy which may need to + * be preserved for subsequent heals. So unconditionally move the + * directory to anonymous-inode directory*/ + *anon_inode = _gf_true; + goto anon_inode; + } + + frame = afr_frame_create(this, &ret); + if (!frame) { + ret = -ret; + goto out; + } + local = frame->local; + gf_uuid_copy(loc.gfid, replies[child].poststat.ia_gfid); + AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc, + NULL); + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == 0) { + lookup_success[i] = 1; + } else if (local->replies[i].op_errno != ENOENT && + local->replies[i].op_errno != ESTALE) { + ret = -local->replies[i].op_errno; + } + } + + if (priv->quorum_count) { + if (afr_has_quorum(lookup_success, this, NULL)) { + *anon_inode = _gf_true; + } + } else if (AFR_COUNT(lookup_success, priv->child_count) > 1) { + *anon_inode = _gf_true; + } else if (ret) { + goto out; + } + +anon_inode: + if (!*anon_inode) { + ret = 0; + goto out; + } + + loc.parent = inode_ref(dir); + gf_uuid_copy(loc.pargfid, dir->gfid); + loc.name = name; + + ret = afr_anon_inode_create(this, child, &loc2.parent); + if (ret < 0) + goto out; + + loc2.name = g; + ret = syncop_rename(subvol, &loc, &loc2, NULL, NULL); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_EXPUNGING_FILE_OR_DIR, + "Rename to %s dir %s/%s (%s) on %s failed", + priv->anon_inode_name, uuid_utoa(dir->gfid), name, g, + subvol->name); + } else { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR, + "Rename to %s dir %s/%s (%s) on %s successful", + priv->anon_inode_name, uuid_utoa(dir->gfid), name, g, + subvol->name); + } - if (sh->entries_skipped) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - goto out; - } - afr_sh_erase_pending (frame, this, AFR_ENTRY_TRANSACTION, - afr_sh_entry_erase_pending_cbk, - afr_sh_entry_finish); - return 0; out: - afr_sh_entry_finish (frame, this); - return 0; -} - - - -static int -next_active_source (call_frame_t *frame, xlator_t *this, - int current_active_source) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int source = -1; - int next_active_source = -1; - int i = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - source = sh->source; - - if (source != -1) { - if (current_active_source != source) - next_active_source = source; - goto out; - } - - /* - the next active sink becomes the source for the - 'conservative decision' of merging all entries - */ - - for (i = 0; i < priv->child_count; i++) { - if ((sh->sources[i] == 0) - && (local->child_up[i] == 1) - && (i > current_active_source)) { + loc_wipe(&loc); + loc_wipe(&loc2); + if (frame) { + AFR_STACK_DESTROY(frame); + } - next_active_source = i; - break; - } - } -out: - return next_active_source; + return ret; } - - -static int -next_active_sink (call_frame_t *frame, xlator_t *this, - int current_active_sink) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int next_active_sink = -1; - int i = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - /* - the next active sink becomes the source for the - 'conservative decision' of merging all entries - */ - - for (i = 0; i < priv->child_count; i++) { - if ((sh->sources[i] == 0) - && (local->child_up[i] == 1) - && (i > current_active_sink)) { - - next_active_sink = i; - break; - } - } - - return next_active_sink; -} - -int -afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this); - -int -afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this); - -int -afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this); - -int -afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, - int active_src); - -int -afr_sh_entry_expunge_entry_done (call_frame_t *frame, xlator_t *this, - int active_src, int32_t op_ret, - int32_t op_errno) -{ - int call_count = 0; - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_entry_expunge_subvol (frame, this, active_src); - - return 0; -} - -int -afr_sh_entry_expunge_parent_setattr_cbk (call_frame_t *expunge_frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop, - dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - call_frame_t *frame = NULL; - int active_src = (long) cookie; - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - - if (op_ret != 0) { - gf_log (this->name, GF_LOG_ERROR, - "setattr on parent directory of %s on subvolume %s failed: %s", - expunge_local->loc.path, - priv->children[active_src]->name, strerror (op_errno)); - } - - AFR_STACK_DESTROY (expunge_frame); - sh->expunge_done (frame, this, active_src, op_ret, op_errno); - - return 0; -} - - int -afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name, + inode_t *inode, int child, struct afr_reply *replies) { - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - int active_src = 0; - int32_t valid = 0; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - - active_src = (long) cookie; - - if (op_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "removed %s on %s", - expunge_local->loc.path, - priv->children[active_src]->name); - } else { - gf_log (this->name, GF_LOG_INFO, - "removing %s on %s failed (%s)", - expunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - } - - valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - - STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_parent_setattr_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->setattr, - &expunge_sh->parent_loc, - &expunge_sh->parentbuf, - valid, NULL); - - return 0; -} - - -int -afr_sh_entry_expunge_unlink (call_frame_t *expunge_frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - - gf_log (this->name, GF_LOG_TRACE, - "expunging file %s on %s", - expunge_local->loc.path, priv->children[active_src]->name); - - STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->unlink, - &expunge_local->loc, 0, NULL); - - return 0; -} - - - -int -afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - - gf_log (this->name, GF_LOG_DEBUG, - "expunging directory %s on %s", - expunge_local->loc.path, priv->children[active_src]->name); - - STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->rmdir, - &expunge_local->loc, 1, NULL); - - return 0; -} - - -int -afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, - int active_src, struct iatt *buf, - struct iatt *parentbuf) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - call_frame_t *frame = NULL; - int type = 0; - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - loc_t *loc = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - loc = &expunge_local->loc; - - type = buf->ia_type; - if (loc->parent && uuid_is_null (loc->parent->gfid)) - uuid_copy (loc->pargfid, parentbuf->ia_gfid); - - switch (type) { - case IA_IFSOCK: - case IA_IFREG: - case IA_IFBLK: - case IA_IFCHR: - case IA_IFIFO: - case IA_IFLNK: - afr_sh_entry_expunge_unlink (expunge_frame, this, active_src); - break; + char g[64] = {0}; + afr_private_t *priv = NULL; + xlator_t *subvol = NULL; + int ret = 0; + loc_t loc = { + 0, + }; + gf_boolean_t anon_inode = _gf_false; + + priv = this->private; + subvol = priv->children[child]; + + if ((!replies[child].valid) || (replies[child].op_ret < 0)) { + /*Nothing to do*/ + ret = 0; + goto out; + } + + if (priv->use_anon_inode) { + ret = afr_selfheal_entry_anon_inode(this, dir, name, inode, child, + replies, &anon_inode); + if (ret < 0 || anon_inode) + goto out; + } + + loc.parent = inode_ref(dir); + loc.inode = inode_new(inode->table); + if (!loc.inode) { + ret = -ENOMEM; + goto out; + } + loc.name = name; + switch (replies[child].poststat.ia_type) { case IA_IFDIR: - afr_sh_entry_expunge_rmdir (expunge_frame, this, active_src); - break; + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR, + "expunging dir %s/%s (%s) on %s", uuid_utoa(dir->gfid), name, + uuid_utoa_r(replies[child].poststat.ia_gfid, g), + subvol->name); + ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL); + break; default: - gf_log (this->name, GF_LOG_ERROR, - "%s has unknown file type on %s: 0%o", - expunge_local->loc.path, - priv->children[active_src]->name, type); - goto out; - break; - } + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR, + "expunging file %s/%s (%s) on %s", uuid_utoa(dir->gfid), + name, uuid_utoa_r(replies[child].poststat.ia_gfid, g), + subvol->name); + ret = syncop_unlink(subvol, &loc, NULL, NULL); + break; + } - return 0; out: - AFR_STACK_DESTROY (expunge_frame); - sh->expunge_done (frame, this, active_src, -1, EINVAL); - - return 0; + loc_wipe(&loc); + return ret; } - int -afr_sh_entry_expunge_lookup_cbk (call_frame_t *expunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *x, - struct iatt *postparent) +afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source, + unsigned char *sources, inode_t *dir, + const char *name, inode_t *inode, + struct afr_reply *replies) { - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - call_frame_t *frame = NULL; - int active_src = 0; - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - active_src = (long) cookie; - local = frame->local; - sh = &local->self_heal; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "lookup of %s on %s failed (%s)", - expunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); + int ret = 0; + loc_t loc = { + 0, + }; + loc_t srcloc = { + 0, + }; + loc_t anonloc = { + 0, + }; + xlator_t *this = frame->this; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + struct iatt *iatt = NULL; + char *linkname = NULL; + mode_t mode = 0; + struct iatt newent = { + 0, + }; + unsigned char *newentry = NULL; + char iatt_uuid_str[64] = {0}; + char dir_uuid_str[64] = {0}; + + priv = this->private; + iatt = &replies[source].poststat; + uuid_utoa_r(iatt->ia_gfid, iatt_uuid_str); + if (iatt->ia_type == IA_INVAL || gf_uuid_is_null(iatt->ia_gfid)) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SELF_HEAL_FAILED, + "Invalid ia_type (%d) or gfid(%s). source brick=%d, " + "pargfid=%s, name=%s", + iatt->ia_type, iatt_uuid_str, source, + uuid_utoa_r(dir->gfid, dir_uuid_str), name); + ret = -EINVAL; + goto out; + } + + xdata = dict_new(); + if (!xdata) + return -ENOMEM; + newentry = alloca0(priv->child_count); + loc.parent = inode_ref(dir); + gf_uuid_copy(loc.pargfid, dir->gfid); + loc.name = name; + loc.inode = inode_ref(inode); + + ret = afr_selfheal_entry_delete(this, dir, name, inode, dst, replies); + if (ret) + goto out; + + ret = dict_set_gfuuid(xdata, "gfid-req", replies[source].poststat.ia_gfid, + true); + if (ret) + goto out; + + srcloc.inode = inode_ref(inode); + gf_uuid_copy(srcloc.gfid, iatt->ia_gfid); + ret = syncop_lookup(priv->children[dst], &srcloc, 0, 0, 0, 0); + if (ret == -ENOENT || ret == -ESTALE) { + newentry[dst] = 1; + ret = afr_selfheal_newentry_mark(frame, this, inode, source, replies, + sources, newentry); + if (ret) + goto out; + } else if (ret == 0 && iatt->ia_type == IA_IFDIR && priv->use_anon_inode) { + // Try rename from hidden directory + ret = afr_anon_inode_create(this, dst, &anonloc.parent); + if (ret < 0) + goto out; + anonloc.inode = inode_ref(inode); + anonloc.name = iatt_uuid_str; + ret = syncop_rename(priv->children[dst], &anonloc, &loc, NULL, NULL); + if (ret == -ENOENT || ret == -ESTALE) + ret = -1; /*This sets 'mismatch' to true*/ + goto out; + } + + mode = st_mode_from_ia(iatt->ia_prot, iatt->ia_type); + + switch (iatt->ia_type) { + case IA_IFDIR: + ret = syncop_mkdir(priv->children[dst], &loc, mode, 0, xdata, NULL); + break; + case IA_IFLNK: + if (!newentry[dst]) { + ret = syncop_link(priv->children[dst], &srcloc, &loc, &newent, + NULL, NULL); + } else { + ret = syncop_readlink(priv->children[source], &srcloc, + &linkname, 4096, NULL, NULL); + if (ret <= 0) + goto out; + ret = syncop_symlink(priv->children[dst], &loc, linkname, NULL, + xdata, NULL); + } + break; + default: + ret = dict_set_int32_sizen(xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1); + if (ret) goto out; - } - - afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf, - postparent); + ret = syncop_mknod( + priv->children[dst], &loc, mode, + makedev(ia_major(iatt->ia_rdev), ia_minor(iatt->ia_rdev)), + &newent, xdata, NULL); + break; + } - return 0; out: - AFR_STACK_DESTROY (expunge_frame); - sh->expunge_done (frame, this, active_src, op_ret, op_errno); - - return 0; + if (xdata) + dict_unref(xdata); + GF_FREE(linkname); + loc_wipe(&loc); + loc_wipe(&srcloc); + loc_wipe(&anonloc); + return ret; } - -int -afr_sh_entry_expunge_purge (call_frame_t *expunge_frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on %s", - expunge_local->loc.path, priv->children[active_src]->name); - - STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_lookup_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->lookup, - &expunge_local->loc, NULL); - - return 0; -} - -int -afr_sh_entry_expunge_entry_cbk (call_frame_t *expunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *x, - struct iatt *postparent) +static int +__afr_selfheal_heal_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd, + char *name, inode_t *inode, int source, + unsigned char *sources, unsigned char *healed_sinks, + unsigned char *locked_on, struct afr_reply *replies) { - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - int source = 0; - call_frame_t *frame = NULL; - int active_src = 0; - int need_expunge = 0; - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - active_src = expunge_sh->active_source; - source = (long) cookie; - local = frame->local; - sh = &local->self_heal; - - if (op_ret == -1 && op_errno == ENOENT) - need_expunge = 1; - else if (op_ret == -1) - goto out; - - if (!uuid_is_null (expunge_sh->entrybuf.ia_gfid) && - !uuid_is_null (buf->ia_gfid) && - (uuid_compare (expunge_sh->entrybuf.ia_gfid, buf->ia_gfid) != 0)) { - char uuidbuf1[64]; - char uuidbuf2[64]; - gf_log (this->name, GF_LOG_DEBUG, - "entry %s found on %s with mismatching gfid (%s/%s)", - expunge_local->loc.path, - priv->children[source]->name, - uuid_utoa_r (expunge_sh->entrybuf.ia_gfid, uuidbuf1), - uuid_utoa_r (buf->ia_gfid, uuidbuf2)); - need_expunge = 1; - } - - if (need_expunge) { - gf_log (this->name, GF_LOG_INFO, - "Entry %s is missing on %s and deleting from " - "replica's other bricks", - expunge_local->loc.path, - priv->children[source]->name); + int ret = 0; + afr_private_t *priv = NULL; + int i = 0; - if (postparent) - expunge_sh->parentbuf = *postparent; + priv = this->private; - afr_sh_entry_expunge_purge (expunge_frame, this, active_src); + if (!replies[source].valid) + return -EIO; - return 0; - } + /* Skip healing this entry if the last lookup on it failed for reasons + * other than ENOENT. + */ + if ((replies[source].op_ret < 0) && (replies[source].op_errno != ENOENT)) + return -replies[source].op_errno; -out: - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "%s exists under %s", - expunge_local->loc.path, - priv->children[source]->name); + if (replies[source].op_ret == 0) { + ret = afr_lookup_and_heal_gfid(this, fd->inode, name, inode, replies, + source, sources, + &replies[source].poststat.ia_gfid, NULL); + if (ret) + return ret; + } + + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i]) + continue; + if (replies[source].op_ret == -1 && + replies[source].op_errno == ENOENT) { + ret = afr_selfheal_entry_delete(this, fd->inode, name, inode, i, + replies); } else { - gf_log (this->name, GF_LOG_INFO, - "looking up %s under %s failed (%s)", - expunge_local->loc.path, - priv->children[source]->name, - strerror (op_errno)); - } - - AFR_STACK_DESTROY (expunge_frame); - sh->expunge_done (frame, this, active_src, op_ret, op_errno); - - return 0; -} - -static gf_boolean_t -can_skip_entry_self_heal (char *name, loc_t *parent_loc) -{ - if (strcmp (name, ".") == 0) { - return _gf_true; - } else if (strcmp (name, "..") == 0) { - return _gf_true; - } else if (loc_is_root (parent_loc) && - (strcmp (name, GF_REPLICATE_TRASH_DIR) == 0)) { - return _gf_true; - } - return _gf_false; -} - -int -afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, - gf_dirent_t *entry) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int ret = -1; - call_frame_t *expunge_frame = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - int active_src = 0; - int source = 0; - int op_errno = 0; - char *name = NULL; - int op_ret = -1; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - active_src = sh->active_source; - source = sh->source; - sh->expunge_done = afr_sh_entry_expunge_entry_done; - - name = entry->d_name; - if (can_skip_entry_self_heal (name, &local->loc)) { - op_ret = 0; - goto out; - } + if (!gf_uuid_compare(replies[i].poststat.ia_gfid, + replies[source].poststat.ia_gfid)) + continue; - gf_log (this->name, GF_LOG_TRACE, - "inspecting existence of %s under %s", - name, local->loc.path); - - expunge_frame = copy_frame (frame); - if (!expunge_frame) { - op_errno = ENOMEM; - goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (expunge_local, out); - - expunge_frame->local = expunge_local; - expunge_sh = &expunge_local->self_heal; - expunge_sh->sh_frame = frame; - expunge_sh->active_source = active_src; - expunge_sh->entrybuf = entry->d_stat; - loc_copy (&expunge_sh->parent_loc, &local->loc); - - ret = afr_build_child_loc (this, &expunge_local->loc, &local->loc, - name); - if (ret != 0) { - op_errno = EINVAL; - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on %s", expunge_local->loc.path, - priv->children[source]->name); - - STACK_WIND_COOKIE (expunge_frame, - afr_sh_entry_expunge_entry_cbk, - (void *) (long) source, - priv->children[source], - priv->children[source]->fops->lookup, - &expunge_local->loc, NULL); - - ret = 0; -out: - if (ret == -1) - sh->expunge_done (frame, this, active_src, op_ret, op_errno); - - return 0; -} - - -int -afr_sh_entry_expunge_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - gf_dirent_t *entry = NULL; - off_t last_offset = 0; - int active_src = 0; - int entry_count = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - active_src = sh->active_source; - - if (op_ret <= 0) { - if (op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "readdir of %s on subvolume %s failed (%s)", - local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_TRACE, - "readdir of %s on subvolume %s complete", - local->loc.path, - priv->children[active_src]->name); - } - - afr_sh_entry_expunge_all (frame, this); - return 0; - } - - list_for_each_entry (entry, &entries->list, list) { - last_offset = entry->d_off; - entry_count++; - } - - gf_log (this->name, GF_LOG_TRACE, - "readdir'ed %d entries from %s", - entry_count, priv->children[active_src]->name); - - sh->offset = last_offset; - local->call_count = entry_count; - - list_for_each_entry (entry, &entries->list, list) { - afr_sh_entry_expunge_entry (frame, this, entry); + ret = afr_selfheal_recreate_entry(frame, i, source, sources, + fd->inode, name, inode, replies); } + if (ret < 0) + break; + } - return 0; + return ret; } -int -afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk, - priv->children[active_src], - priv->children[active_src]->fops->readdirp, - sh->healing_fd, sh->block_size, sh->offset, NULL); - - return 0; -} - - -int -afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int active_src = -1; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sh->offset = 0; - - if (sh->source == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "no active sources for %s to expunge entries", - local->loc.path); - goto out; - } - - active_src = next_active_sink (frame, this, sh->active_source); - sh->active_source = active_src; - - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - goto out; - } - - if (active_src == -1) { - /* completed creating missing files on all subvolumes */ - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "expunging entries of %s on %s to other sinks", - local->loc.path, priv->children[active_src]->name); - - afr_sh_entry_expunge_subvol (frame, this, active_src); - - return 0; -out: - afr_sh_entry_impunge_all (frame, this); - return 0; - -} - - -int -afr_sh_entry_impunge_entry_done (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) +static int +afr_selfheal_detect_gfid_and_type_mismatch(xlator_t *this, + struct afr_reply *replies, + inode_t *inode, uuid_t pargfid, + char *bname, int src_idx, + unsigned char *locked_on, int *src) { - int call_count = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - if (op_ret < 0) - sh->entries_skipped = _gf_true; - call_count = afr_frame_return (frame); - if (call_count == 0) - afr_sh_entry_impunge_subvol (frame, this); - - return 0; + int i = 0; + int ret = -1; + afr_private_t *priv = NULL; + void *gfid = NULL; + ia_type_t ia_type = IA_INVAL; + + priv = this->private; + gfid = &replies[src_idx].poststat.ia_gfid; + ia_type = replies[src_idx].poststat.ia_type; + + for (i = 0; i < priv->child_count; i++) { + if (i == src_idx) + continue; + + if (!replies[i].valid) + continue; + + if (replies[i].op_ret != 0) + continue; + + if (gf_uuid_is_null(replies[i].poststat.ia_gfid)) + continue; + + if (replies[i].poststat.ia_type == IA_INVAL) + continue; + + if (ia_type == IA_INVAL || gf_uuid_is_null(gfid)) { + src_idx = i; + ia_type = replies[src_idx].poststat.ia_type; + gfid = &replies[src_idx].poststat.ia_gfid; + continue; + } + + if (gf_uuid_compare(gfid, replies[i].poststat.ia_gfid) && + (ia_type == replies[i].poststat.ia_type)) { + ret = afr_gfid_split_brain_source(this, replies, inode, pargfid, + bname, src_idx, i, locked_on, src, + NULL); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "Skipping conservative merge on the " + "file."); + return ret; + } + + if (ia_type != replies[i].poststat.ia_type) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "Type mismatch detected " + "for <gfid:%s>/%s>, %s on %s and %s on %s. " + "Skipping conservative merge on the file.", + uuid_utoa(pargfid), bname, + gf_inode_type_to_str(replies[i].poststat.ia_type), + priv->children[i]->name, + gf_inode_type_to_str(replies[src_idx].poststat.ia_type), + priv->children[src_idx]->name); + gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" + "subvol=%s;type=file;" + "file=<gfid:%s>/%s>;count=2;child-%d=%s;type-" + "%d=%s;child-%d=%s;type-%d=%s", + this->ctx->cmd_args.client_pid, this->name, + uuid_utoa(pargfid), bname, i, priv->children[i]->name, i, + gf_inode_type_to_str(replies[i].poststat.ia_type), src_idx, + priv->children[src_idx]->name, src_idx, + gf_inode_type_to_str(replies[src_idx].poststat.ia_type)); + return -1; + } + } + + return 0; } -void -afr_sh_entry_call_impunge_done (call_frame_t *impunge_frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) +static int +__afr_selfheal_merge_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd, + char *name, inode_t *inode, unsigned char *sources, + unsigned char *healed_sinks, + unsigned char *locked_on, struct afr_reply *replies) { - afr_local_t *impunge_local = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *impunge_sh = NULL; - call_frame_t *frame = NULL; + int ret = 0; + int i = 0; + int source = -1; + int src = -1; + afr_private_t *priv = NULL; - AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, - frame, local, sh); - - AFR_STACK_DESTROY (impunge_frame); - sh->impunge_done (frame, this, op_ret, op_errno); -} + priv = this->private; -int -afr_sh_entry_impunge_setattr_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop, - dict_t *xdata) -{ - int call_count = 0; - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - int child_index = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - child_index = (long) cookie; - - if (op_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "setattr done for %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - } else { - gf_log (this->name, GF_LOG_INFO, - "setattr (%s) on %s failed (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - } - - call_count = afr_frame_return (impunge_frame); - if (call_count == 0) { - afr_sh_entry_call_impunge_done (impunge_frame, this, - 0, op_errno); + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid && replies[i].op_ret == 0) { + source = i; + break; } + } + if (source == -1) { + /* entry got deleted in the mean time? */ return 0; -} - -int -afr_sh_entry_impunge_parent_setattr_cbk (call_frame_t *setattr_frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop, - dict_t *xdata) -{ - int call_count = 0; - afr_local_t *setattr_local = NULL; - - setattr_local = setattr_frame->local; - if (op_ret != 0) { - gf_log (this->name, GF_LOG_INFO, - "setattr on parent directory (%s) failed: %s", - setattr_local->loc.path, strerror (op_errno)); - } - - call_count = afr_frame_return (setattr_frame); - if (call_count == 0) - AFR_STACK_DESTROY (setattr_frame); - return 0; -} - -int -afr_sh_entry_impunge_setattr (call_frame_t *impunge_frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_local_t *setattr_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - call_frame_t *setattr_frame = NULL; - int32_t valid = 0; - int32_t op_errno = 0; - int child_index = 0; - int call_count = 0; - int i = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - - gf_log (this->name, GF_LOG_DEBUG, - "setting ownership of %s on %s to %d/%d", - impunge_local->loc.path, - priv->children[child_index]->name, - impunge_sh->entrybuf.ia_uid, - impunge_sh->entrybuf.ia_gid); - - setattr_frame = copy_frame (impunge_frame); - if (!setattr_frame) { - op_errno = ENOMEM; - goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (setattr_frame->local, out); - setattr_local = setattr_frame->local; - call_count = afr_errno_count (NULL, impunge_sh->child_errno, - priv->child_count, 0); - loc_copy (&setattr_local->loc, &impunge_sh->parent_loc); - impunge_local->call_count = call_count; - setattr_local->call_count = call_count; + } + + /* Set all the sources as 1, otheriwse newentry_mark won't be set */ + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid && replies[i].op_ret == 0) { + sources[i] = 1; + } + } + + ret = afr_lookup_and_heal_gfid(this, fd->inode, name, inode, replies, + source, sources, + &replies[source].poststat.ia_gfid, NULL); + if (ret) + return ret; + + /* In case of type mismatch / unable to resolve gfid mismatch on the + * entry, return -1.*/ + ret = afr_selfheal_detect_gfid_and_type_mismatch( + this, replies, inode, fd->inode->gfid, name, source, locked_on, &src); + + if (ret < 0) + return ret; + if (src != -1) { + source = src; for (i = 0; i < priv->child_count; i++) { - if (impunge_sh->child_errno[i]) - continue; - valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - STACK_WIND_COOKIE (setattr_frame, - afr_sh_entry_impunge_parent_setattr_cbk, - (void *) (long) i, priv->children[i], - priv->children[i]->fops->setattr, - &setattr_local->loc, - &impunge_sh->parentbuf, valid, NULL); - - valid = GF_SET_ATTR_UID | GF_SET_ATTR_GID | - GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - STACK_WIND_COOKIE (impunge_frame, - afr_sh_entry_impunge_setattr_cbk, - (void *) (long) i, priv->children[i], - priv->children[i]->fops->setattr, - &impunge_local->loc, - &impunge_sh->entrybuf, valid, NULL); - call_count--; + if (i != src && replies[i].valid && + gf_uuid_compare(replies[src].poststat.ia_gfid, + replies[i].poststat.ia_gfid)) { + sources[i] = 0; + } } - GF_ASSERT (!call_count); - return 0; -out: - if (setattr_frame) - AFR_STACK_DESTROY (setattr_frame); - afr_sh_entry_call_impunge_done (impunge_frame, this, 0, op_errno); - return 0; -} + } -int -afr_sh_entry_impunge_xattrop_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - dict_t *xattr, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - int child_index = 0; + for (i = 0; i < priv->child_count; i++) { + if (i == source || !healed_sinks[i]) + continue; - priv = this->private; - impunge_local = impunge_frame->local; - - child_index = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "%s: failed to perform xattrop on %s (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - goto out; + if (src != -1) { + if (!gf_uuid_compare(replies[src].poststat.ia_gfid, + replies[i].poststat.ia_gfid)) + continue; + } else if (replies[i].op_errno != ENOENT) { + continue; } - afr_sh_entry_impunge_setattr (impunge_frame, this); - return 0; -out: - afr_sh_entry_call_impunge_done (impunge_frame, this, - -1, op_errno); - return 0; -} - -int -afr_sh_entry_impunge_perform_xattrop (call_frame_t *impunge_frame, - xlator_t *this) -{ - int active_src = 0; - dict_t *xattr = NULL; - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int32_t op_errno = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - active_src = impunge_sh->active_source; - - afr_prepare_new_entry_pending_matrix (impunge_local->pending, - afr_is_errno_unset, - impunge_sh->child_errno, - &impunge_sh->entrybuf, - priv->child_count); - xattr = dict_new (); - if (!xattr) { - op_errno = ENOMEM; - goto out; - } - - afr_set_pending_dict (priv, xattr, impunge_local->pending, active_src, - LOCAL_LAST); + ret |= afr_selfheal_recreate_entry(frame, i, source, sources, fd->inode, + name, inode, replies); + } - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_xattrop_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->xattrop, - &impunge_local->loc, GF_XATTROP_ADD_ARRAY, xattr, NULL); - - if (xattr) - dict_unref (xattr); - return 0; -out: - afr_sh_entry_call_impunge_done (impunge_frame, this, - -1, op_errno); - return 0; + return ret; } -int -afr_sh_entry_impunge_newfile_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +static int +__afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd, + char *name, inode_t *inode, int source, + unsigned char *sources, unsigned char *healed_sinks, + unsigned char *locked_on, struct afr_reply *replies) { - int call_count = 0; - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int child_index = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - - child_index = (long) cookie; - - if (op_ret == -1) { - impunge_sh->child_errno[child_index] = op_errno; - gf_log (this->name, GF_LOG_ERROR, - "creation of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - } else { - impunge_sh->child_errno[child_index] = 0; - } - - call_count = afr_frame_return (impunge_frame); - if (call_count == 0) { - if (!afr_errno_count (NULL, impunge_sh->child_errno, - priv->child_count, 0)) { - // new_file creation failed every where - afr_sh_entry_call_impunge_done (impunge_frame, this, - -1, op_errno); - goto out; - } - afr_sh_entry_impunge_perform_xattrop (impunge_frame, this); - } -out: - return 0; + int ret = -1; + + if (source < 0) + ret = __afr_selfheal_merge_dirent(frame, this, fd, name, inode, sources, + healed_sinks, locked_on, replies); + else + ret = __afr_selfheal_heal_dirent(frame, this, fd, name, inode, source, + sources, healed_sinks, locked_on, + replies); + return ret; } -int -afr_sh_entry_impunge_hardlink_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +static gf_boolean_t +is_full_heal_marker_present(xlator_t *this, dict_t *xdata, int idx) { - int call_count = 0; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - - if (IA_IFLNK == impunge_sh->entrybuf.ia_type) { - //For symlinks impunge is attempted un-conditionally - //So the file can already exist. - if ((op_ret < 0) && (op_errno == EEXIST)) - op_ret = 0; - } + int i = 0; + int pending[3] = { + 0, + }; + void *pending_raw = NULL; + afr_private_t *priv = NULL; - call_count = afr_frame_return (impunge_frame); - if (call_count == 0) - afr_sh_entry_call_impunge_done (impunge_frame, this, - op_ret, op_errno); + priv = this->private; - return 0; -} + if (!xdata) + return _gf_false; -int -afr_sh_entry_impunge_hardlink (call_frame_t *impunge_frame, xlator_t *this, - int child_index) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - loc_t *loc = NULL; - struct iatt *buf = NULL; - loc_t oldloc = {0}; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - loc = &impunge_local->loc; - buf = &impunge_sh->entrybuf; - - oldloc.inode = inode_ref (loc->inode); - uuid_copy (oldloc.gfid, buf->ia_gfid); - gf_log (this->name, GF_LOG_DEBUG, "linking missing file %s on %s", - loc->path, priv->children[child_index]->name); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_hardlink_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->link, - &oldloc, loc, NULL); - loc_wipe (&oldloc); + /* Iterate over each of the priv->pending_keys[] elements and then + * see if any of them have data segment non-zero. If they do, return + * true. Else return false. + */ + for (i = 0; i < priv->child_count; i++) { + if (dict_get_ptr(xdata, priv->pending_key[i], &pending_raw)) + continue; - return 0; -} + if (!pending_raw) + continue; -int -afr_sh_nameless_lookup_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xattr, - struct iatt *postparent) -{ - if (op_ret < 0) { - afr_sh_entry_impunge_create_file (impunge_frame, this, - (long)cookie); - } else { - afr_sh_entry_impunge_hardlink (impunge_frame, this, - (long)cookie); - } - return 0; -} + memcpy(pending, pending_raw, sizeof(pending)); + if (ntoh32(pending[idx])) + return _gf_true; + } -int -afr_sh_entry_impunge_check_hardlink (call_frame_t *impunge_frame, - xlator_t *this, - int child_index, struct iatt *stbuf) -{ - afr_private_t *priv = NULL; - call_frame_t *frame = NULL; - afr_local_t *impunge_local = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *impunge_sh = NULL; - afr_self_heal_t *sh = NULL; - loc_t *loc = NULL; - dict_t *xattr_req = NULL; - loc_t oldloc = {0}; - int ret = -1; - - priv = this->private; - AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, - frame, local, sh); - loc = &impunge_local->loc; - - xattr_req = dict_new (); - if (!xattr_req) - goto out; - oldloc.inode = inode_ref (loc->inode); - uuid_copy (oldloc.gfid, stbuf->ia_gfid); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_nameless_lookup_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->lookup, - &oldloc, xattr_req); - ret = 0; -out: - if (xattr_req) - dict_unref (xattr_req); - loc_wipe (&oldloc); - if (ret) - sh->impunge_done (frame, this, -1, ENOMEM); - return 0; + return _gf_false; } -int -afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct iatt *stbuf) +static gf_boolean_t +afr_need_full_heal(xlator_t *this, struct afr_reply *replies, int source, + unsigned char *healed_sinks, afr_transaction_type type) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - dict_t *dict = NULL; - int ret = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - - gf_log (this->name, GF_LOG_DEBUG, - "creating missing file %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - - dict = dict_new (); - if (!dict) - gf_log (this->name, GF_LOG_ERROR, "Out of memory"); - - GF_ASSERT (!uuid_is_null (stbuf->ia_gfid)); - ret = afr_set_dict_gfid (dict, stbuf->ia_gfid); - if (ret) - gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed", - impunge_local->loc.path); - - /* - * Reason for adding GLUSTERFS_INTERNAL_FOP_KEY : - * - * Problem: - * While a brick is down in a replica pair, lets say the user creates - * one file(file-A) and a hard link to that file(h-file-A). After the - * brick comes back up, entry self-heal is attempted on parent dir of - * these two files. As part of readdir in self-heal it reads both the - * entries file-A and h-file-A for both of them it does name less lookup - * to check if there are any hardlinks already present in the - * destination brick. It finds that there are no hard links already - * present for files file-A, h-file-A. Self-heal does mknods for both - * file-A and h-file-A. This leads to file-A and h-file-A not being - * hardlinks anymore. - * - * Fix: (More like shrinking of race-window, the race itself is still - * present in posix-mknod). - * If mknod comes with the presence of GLUSTERFS_INTERNAL_FOP_KEY then - * posix_mknod checks if there are already any gfid-links and does - * link() instead of mknod. There still can be a race where two - * posix_mknods same gfid see that - * gfid-link file is not present and proceeds with mknods and result in - * two different files with same gfid. - */ - ret = dict_set_str (dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); - if (ret) - gf_log (this->name, GF_LOG_INFO, "%s: %s set failed", - impunge_local->loc.path, GLUSTERFS_INTERNAL_FOP_KEY); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->mknod, - &impunge_local->loc, - st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type), - makedev (ia_major (stbuf->ia_rdev), - ia_minor (stbuf->ia_rdev)), 0, dict); - - if (dict) - dict_unref (dict); - - return 0; -} - + int i = 0; + int idx = 0; + afr_private_t *priv = NULL; + priv = this->private; -int -afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct iatt *stbuf) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - dict_t *dict = NULL; + if (!priv->esh_granular) + return _gf_true; - int ret = 0; + if (type != AFR_ENTRY_TRANSACTION) + return _gf_true; - priv = this->private; - impunge_local = impunge_frame->local; + priv = this->private; + idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION); - dict = dict_new (); - if (!dict) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - return 0; - } + /* If there is a clear source, check whether the full-heal-indicator + * is present in its xdata. Otherwise, we need to examine all the + * participating bricks and then figure if *even* one of them has a + * full-heal-indicator. + */ - GF_ASSERT (!uuid_is_null (stbuf->ia_gfid)); - ret = afr_set_dict_gfid (dict, stbuf->ia_gfid); - if (ret) - gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed", - impunge_local->loc.path); + if (source != -1) { + if (is_full_heal_marker_present(this, replies[source].xdata, idx)) + return _gf_true; + } - gf_log (this->name, GF_LOG_DEBUG, - "creating missing directory %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); + /* else ..*/ - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->mkdir, - &impunge_local->loc, - st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type), - 0, dict); + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i]) + continue; - if (dict) - dict_unref (dict); + if (is_full_heal_marker_present(this, replies[i].xdata, idx)) + return _gf_true; + } - return 0; + return _gf_false; } - -int -afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this, - int child_index, const char *linkname) +static int +__afr_selfheal_entry_finalize_source(xlator_t *this, unsigned char *sources, + unsigned char *healed_sinks, + unsigned char *locked_on, + struct afr_reply *replies, + uint64_t *witness) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - dict_t *dict = NULL; - struct iatt *buf = NULL; - int ret = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - - buf = &impunge_local->cont.dir_fop.buf; + afr_private_t *priv = NULL; + int source = -1; + int sources_count = 0; + int i = 0; - dict = dict_new (); - if (!dict) { - afr_sh_entry_call_impunge_done (impunge_frame, this, - -1, ENOMEM); - goto out; - } - - GF_ASSERT (!uuid_is_null (buf->ia_gfid)); - ret = afr_set_dict_gfid (dict, buf->ia_gfid); - if (ret) - gf_log (this->name, GF_LOG_INFO, - "%s: dict set gfid failed", - impunge_local->loc.path); - - gf_log (this->name, GF_LOG_DEBUG, - "creating missing symlink %s -> %s on %s", - impunge_local->loc.path, linkname, - priv->children[child_index]->name); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->symlink, - linkname, &impunge_local->loc, 0, dict); - - if (dict) - dict_unref (dict); -out: - return 0; -} + priv = this->private; + sources_count = AFR_COUNT(sources, priv->child_count); -int -afr_sh_entry_impunge_symlink_unlink_cbk (call_frame_t *impunge_frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int child_index = -1; - int call_count = -1; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - - child_index = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "unlink of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - goto out; - } + if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) || + !sources_count || afr_does_witness_exist(this, witness)) { + memset(sources, 0, sizeof(*sources) * priv->child_count); + afr_mark_active_sinks(this, sources, locked_on, healed_sinks); + return -1; + } - afr_sh_entry_impunge_symlink (impunge_frame, this, child_index, - impunge_sh->linkname); + source = afr_choose_source_by_policy(priv, sources, AFR_ENTRY_TRANSACTION); - return 0; -out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; + /*If the selected source does not blame any other brick, then mark + * everything as sink to trigger conservative merge. + */ + if (source != -1 && !AFR_COUNT(healed_sinks, priv->child_count)) { + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i]) { + sources[i] = 0; + healed_sinks[i] = 1; + } } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) - afr_sh_entry_call_impunge_done (impunge_frame, this, - op_ret, op_errno); + return -1; + } - return 0; + return source; } - int -afr_sh_entry_impunge_symlink_unlink (call_frame_t *impunge_frame, xlator_t *this, - int child_index) +__afr_selfheal_entry_prepare(call_frame_t *frame, xlator_t *this, + inode_t *inode, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, + struct afr_reply *replies, int *source_p, + unsigned char *pflag) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - - priv = this->private; - impunge_local = impunge_frame->local; - - gf_log (this->name, GF_LOG_DEBUG, - "unlinking symlink %s with wrong target on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_symlink_unlink_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->unlink, - &impunge_local->loc, 0, NULL); - - return 0; + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + uint64_t *witness = NULL; + + priv = this->private; + + ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies); + if (ret) + return ret; + + witness = alloca0(sizeof(*witness) * priv->child_count); + ret = afr_selfheal_find_direction(frame, this, replies, + AFR_ENTRY_TRANSACTION, locked_on, sources, + sinks, witness, pflag); + if (ret) + return ret; + + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). + + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. + */ + AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count); + + source = __afr_selfheal_entry_finalize_source(this, sources, healed_sinks, + locked_on, replies, witness); + + if (source < 0) { + /* If source is < 0 (typically split-brain), we perform a + conservative merge of entries rather than erroring out */ + } + *source_p = source; + + return ret; } - -int -afr_sh_entry_impunge_readlink_sink_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - const char *linkname, struct iatt *sbuf, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int child_index = -1; - int call_count = -1; - int active_src = -1; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - active_src = impunge_sh->active_source; - - child_index = (long) cookie; - - if ((op_ret == -1) && (op_errno != ENOENT)) { - gf_log (this->name, GF_LOG_INFO, - "readlink of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - goto out; - } - - /* symlink doesn't exist on the sink */ - - if ((op_ret == -1) && (op_errno == ENOENT)) { - afr_sh_entry_impunge_symlink (impunge_frame, this, - child_index, impunge_sh->linkname); - return 0; - } - - - /* symlink exists on the sink, so check if targets match */ - - if (strcmp (linkname, impunge_sh->linkname) == 0) { - /* targets match, nothing to do */ - - goto out; - } else { - /* - * Hah! Sneaky wolf in sheep's clothing! - */ - afr_sh_entry_impunge_symlink_unlink (impunge_frame, this, - child_index); - return 0; - } - -out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) - afr_sh_entry_call_impunge_done (impunge_frame, this, - op_ret, op_errno); - - return 0; -} - - -int -afr_sh_entry_impunge_readlink_sink (call_frame_t *impunge_frame, xlator_t *this, - int child_index) +static int +afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd, + char *name, inode_t *parent_idx_inode, + xlator_t *subvol, gf_boolean_t full_crawl) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - - priv = this->private; - impunge_local = impunge_frame->local; - - gf_log (this->name, GF_LOG_DEBUG, - "checking symlink target of %s on %s", - impunge_local->loc.path, priv->children[child_index]->name); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_sink_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->readlink, - &impunge_local->loc, 4096, NULL); - + int ret = 0; + int source = -1; + unsigned char *locked_on = NULL; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *healed_sinks = NULL; + inode_t *inode = NULL; + struct afr_reply *replies = NULL; + struct afr_reply *par_replies = NULL; + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + + priv = this->private; + + if (afr_is_private_directory(priv, fd->inode->gfid, name, + GF_CLIENT_PID_SELF_HEALD)) { return 0; + } + + xattr = dict_new(); + if (!xattr) + return -ENOMEM; + ret = dict_set_int32_sizen(xattr, GF_GFIDLESS_LOOKUP, 1); + if (ret) { + dict_unref(xattr); + return -1; + } + + sources = alloca0(priv->child_count); + sinks = alloca0(priv->child_count); + healed_sinks = alloca0(priv->child_count); + locked_on = alloca0(priv->child_count); + + replies = alloca0(priv->child_count * sizeof(*replies)); + par_replies = alloca0(priv->child_count * sizeof(*par_replies)); + + ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL, + locked_on); + { + if (ret < priv->child_count) { + gf_msg_debug(this->name, 0, + "%s: Skipping " + "entry self-heal as only %d sub-volumes " + " could be locked in %s domain", + uuid_utoa(fd->inode->gfid), ret, this->name); + ret = -ENOTCONN; + goto unlock; + } + + ret = __afr_selfheal_entry_prepare(frame, this, fd->inode, locked_on, + sources, sinks, healed_sinks, + par_replies, &source, NULL); + if (ret < 0) + goto unlock; + + inode = afr_selfheal_unlocked_lookup_on(frame, fd->inode, name, replies, + locked_on, xattr); + if (!inode) { + ret = -ENOMEM; + goto unlock; + } + + ret = __afr_selfheal_entry_dirent(frame, this, fd, name, inode, source, + sources, healed_sinks, locked_on, + replies); + + if ((ret == 0) && (priv->esh_granular) && parent_idx_inode) { + ret = afr_shd_entry_purge(subvol, parent_idx_inode, name, + inode->ia_type); + /* Why is ret force-set to 0? We do not care about + * index purge failing for full heal as it is quite + * possible during replace-brick that not all files + * and directories have their name indices present in + * entry-changes/. + */ + ret = 0; + } + } + +unlock: + afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL, locked_on, + NULL); + if (inode) + inode_unref(inode); + if (replies) + afr_replies_wipe(replies, priv->child_count); + if (par_replies) + afr_replies_wipe(par_replies, priv->child_count); + if (xattr) + dict_unref(xattr); + + return ret; } - -int -afr_sh_entry_impunge_readlink_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - const char *linkname, struct iatt *sbuf, dict_t *xdata) +static inode_t * +afr_shd_entry_changes_index_inode(xlator_t *this, xlator_t *subvol, + uuid_t pargfid) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int child_index = -1; - int call_count = -1; - int active_src = -1; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - active_src = impunge_sh->active_source; - - child_index = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "readlink of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - goto out; - } - - impunge_sh->linkname = gf_strdup (linkname); - afr_sh_entry_impunge_readlink_sink (impunge_frame, this, child_index); - - return 0; + int ret = -1; + void *index_gfid = NULL; + loc_t rootloc = { + 0, + }; + loc_t loc = { + 0, + }; + dict_t *xattr = NULL; + inode_t *inode = NULL; + struct iatt iatt = { + 0, + }; + + rootloc.inode = inode_ref(this->itable->root); + gf_uuid_copy(rootloc.gfid, rootloc.inode->gfid); + + ret = syncop_getxattr(subvol, &rootloc, &xattr, + GF_XATTROP_ENTRY_CHANGES_GFID, NULL, NULL); + if (ret || !xattr) { + errno = -ret; + goto out; + } + + ret = dict_get_ptr(xattr, GF_XATTROP_ENTRY_CHANGES_GFID, &index_gfid); + if (ret) { + errno = EINVAL; + goto out; + } + + loc.inode = inode_new(this->itable); + if (!loc.inode) { + errno = ENOMEM; + goto out; + } + + gf_uuid_copy(loc.pargfid, index_gfid); + loc.name = gf_strdup(uuid_utoa(pargfid)); + + ret = syncop_lookup(subvol, &loc, &iatt, NULL, NULL, NULL); + if (ret < 0) { + errno = -ret; + goto out; + } + + inode = inode_link(loc.inode, NULL, NULL, &iatt); out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); + if (xattr) + dict_unref(xattr); + loc_wipe(&rootloc); + GF_FREE((char *)loc.name); + loc_wipe(&loc); - if (call_count == 0) - afr_sh_entry_call_impunge_done (impunge_frame, this, - op_ret, op_errno); - - return 0; + return inode; } - -int -afr_sh_entry_impunge_readlink (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct iatt *stbuf) +static int +afr_selfheal_entry_do_subvol(call_frame_t *frame, xlator_t *this, fd_t *fd, + int child) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int active_src = -1; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - active_src = impunge_sh->active_source; - impunge_local->cont.dir_fop.buf = *stbuf; - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk, - (void *) (long) child_index, - priv->children[active_src], - priv->children[active_src]->fops->readlink, - &impunge_local->loc, 4096, NULL); + int ret = 0; + gf_dirent_t entries; + gf_dirent_t *entry = NULL; + off_t offset = 0; + call_frame_t *iter_frame = NULL; + xlator_t *subvol = NULL; + afr_private_t *priv = NULL; + gf_boolean_t mismatch = _gf_false; + afr_local_t *local = NULL; + loc_t loc = { + 0, + }; + + priv = this->private; + subvol = priv->children[child]; + + INIT_LIST_HEAD(&entries.list); + + local = frame->local; + + iter_frame = afr_copy_frame(frame); + if (!iter_frame) + return -ENOMEM; + + loc.inode = afr_shd_entry_changes_index_inode(this, subvol, + fd->inode->gfid); + + while ((ret = syncop_readdir(subvol, fd, 131072, offset, &entries, NULL, + NULL))) { + if (ret > 0) + ret = 0; + list_for_each_entry(entry, &entries.list, list) + { + offset = entry->d_off; - return 0; -} + if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) + continue; -int -afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, - int child_index) -{ - call_frame_t *frame = NULL; - afr_local_t *impunge_local = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *impunge_sh = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - ia_type_t type = IA_INVAL; - int active_src = 0; - struct iatt *buf = NULL; - - AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, - frame, local, sh); - active_src = impunge_sh->active_source; - afr_update_loc_gfids (&impunge_local->loc, &impunge_sh->entrybuf, - &impunge_sh->parentbuf); - - buf = &impunge_sh->entrybuf; - type = buf->ia_type; - - switch (type) { - case IA_IFSOCK: - case IA_IFREG: - case IA_IFBLK: - case IA_IFCHR: - case IA_IFIFO: - case IA_IFLNK: - afr_sh_entry_impunge_check_hardlink (impunge_frame, this, - child_index, buf); + ret = afr_selfheal_entry_dirent(iter_frame, this, fd, entry->d_name, + loc.inode, subvol, + local->need_full_crawl); + AFR_STACK_RESET(iter_frame); + if (iter_frame->local == NULL) { + ret = -ENOTCONN; break; - case IA_IFDIR: - afr_sh_entry_impunge_mkdir (impunge_frame, this, - child_index, buf); - break; - default: - gf_log (this->name, GF_LOG_ERROR, - "%s has unknown file type on %s: 0%o", - impunge_local->loc.path, - priv->children[active_src]->name, type); - sh->impunge_done (frame, this, -1, EINVAL); + } + + if (ret == -1) { + /* gfid or type mismatch. */ + mismatch = _gf_true; + ret = 0; + } + if (ret) break; } - return 0; -} - -int -afr_sh_entry_impunge_create_file (call_frame_t *impunge_frame, xlator_t *this, - int child_index) -{ - call_frame_t *frame = NULL; - afr_local_t *impunge_local = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *impunge_sh = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - ia_type_t type = IA_INVAL; - int active_src = 0; - struct iatt *buf = NULL; - - AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, - frame, local, sh); - active_src = impunge_sh->active_source; - buf = &impunge_sh->entrybuf; - type = buf->ia_type; - - switch (type) { - case IA_IFSOCK: - case IA_IFREG: - case IA_IFBLK: - case IA_IFCHR: - case IA_IFIFO: - afr_sh_entry_impunge_mknod (impunge_frame, this, - child_index, buf); - break; - case IA_IFLNK: - afr_sh_entry_impunge_readlink (impunge_frame, this, - child_index, buf); - break; - default: - gf_log (this->name, GF_LOG_ERROR, - "%s has unknown file type on %s: 0%o", - impunge_local->loc.path, - priv->children[active_src]->name, type); - sh->impunge_done (frame, this, -1, EINVAL); - break; - } - - return 0; -} - -gf_boolean_t -afr_sh_need_recreate (afr_self_heal_t *impunge_sh, unsigned int child, - unsigned int child_count) -{ - gf_boolean_t recreate = _gf_false; - - GF_ASSERT (impunge_sh->child_errno); - - if (child == impunge_sh->active_source) - goto out; + gf_dirent_free(&entries); + if (ret) + break; + } - if (IA_IFLNK == impunge_sh->entrybuf.ia_type) { - recreate = _gf_true; - goto out; - } + loc_wipe(&loc); - if (impunge_sh->child_errno[child] == ENOENT) - recreate = _gf_true; -out: - return recreate; + AFR_STACK_DESTROY(iter_frame); + if (mismatch == _gf_true) + /* undo pending will be skipped */ + ret = -1; + return ret; } -unsigned int -afr_sh_recreate_count (afr_self_heal_t *impunge_sh, int *sources, - unsigned int child_count) +static int +afr_selfheal_entry_granular_dirent(xlator_t *subvol, gf_dirent_t *entry, + loc_t *parent, void *data) { - int count = 0; - int i = 0; + int ret = 0; + loc_t loc = { + 0, + }; + struct iatt iatt = { + 0, + }; + afr_granular_esh_args_t *args = data; + + /* Look up the actual inode associated with entry. If the lookup returns + * ESTALE or ENOENT, then it means we have a stale index. Remove it. + * This is analogous to the check in afr_shd_index_heal() except that + * here it is achieved through LOOKUP and in afr_shd_index_heal() through + * a GETXATTR. + */ + + loc.inode = inode_new(args->xl->itable); + loc.parent = inode_ref(args->heal_fd->inode); + gf_uuid_copy(loc.pargfid, loc.parent->gfid); + loc.name = entry->d_name; + + ret = syncop_lookup(args->xl, &loc, &iatt, NULL, NULL, NULL); + if ((ret == -ENOENT) || (ret == -ESTALE)) { + /* The name indices under the pgfid index dir are guaranteed + * to be regular files. Hence the hardcoding. + */ + afr_shd_entry_purge(subvol, parent->inode, entry->d_name, IA_IFREG); + ret = 0; + goto out; + } + /* TBD: afr_shd_zero_xattrop? */ - for (i = 0; i < child_count; i++) { - if (afr_sh_need_recreate (impunge_sh, i, child_count)) - count++; - } + ret = afr_selfheal_entry_dirent(args->frame, args->xl, args->heal_fd, + entry->d_name, parent->inode, subvol, + _gf_false); + AFR_STACK_RESET(args->frame); + if (args->frame->local == NULL) + ret = -ENOTCONN; - return count; -} + if (ret == -1) + args->mismatch = _gf_true; -int -afr_sh_entry_call_impunge_recreate (call_frame_t *impunge_frame, - xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - call_frame_t *frame = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - unsigned int recreate_count = 0; - int i = 0; - int active_src = 0; - - priv = this->private; - AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, - frame, local, sh); - active_src = impunge_sh->active_source; - impunge_sh->entrybuf = impunge_sh->buf[active_src]; - impunge_sh->parentbuf = impunge_sh->parentbufs[active_src]; - recreate_count = afr_sh_recreate_count (impunge_sh, sh->sources, - priv->child_count); - if (!recreate_count) { - afr_sh_entry_call_impunge_done (impunge_frame, this, 0, 0); - goto out; - } - impunge_local->call_count = recreate_count; - for (i = 0; i < priv->child_count; i++) { - if (!impunge_local->child_up[i]) { - impunge_sh->child_errno[i] = ENOTCONN; - continue; - } - if (!afr_sh_need_recreate (impunge_sh, i, priv->child_count)) { - impunge_sh->child_errno[i] = EEXIST; - continue; - } - } - for (i = 0; i < priv->child_count; i++) { - if (!afr_sh_need_recreate (impunge_sh, i, priv->child_count)) - continue; - (void)afr_sh_entry_impunge_create (impunge_frame, this, i); - recreate_count--; - } - GF_ASSERT (!recreate_count); out: - return 0; -} - -void -afr_sh_entry_common_lookup_done (call_frame_t *impunge_frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - call_frame_t *frame = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - unsigned int gfid_miss_count = 0; - unsigned int children_up_count = 0; - uuid_t gfid = {0}; - int active_src = 0; - - priv = this->private; - AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, - frame, local, sh); - active_src = impunge_sh->active_source; - - if (op_ret < 0) - goto done; - if (impunge_sh->child_errno[active_src]) { - op_ret = -1; - op_errno = impunge_sh->child_errno[active_src]; - goto done; - } - - gfid_miss_count = afr_gfid_missing_count (this->name, - impunge_sh->success_children, - impunge_sh->buf, priv->child_count, - impunge_local->loc.path); - children_up_count = afr_up_children_count (impunge_local->child_up, - priv->child_count); - if ((gfid_miss_count == children_up_count) && - (children_up_count < priv->child_count)) { - op_ret = -1; - op_errno = ENODATA; - gf_log (this->name, GF_LOG_ERROR, "Not all children are up, " - "gfid should not be assigned in this state for %s", - impunge_local->loc.path); - goto done; - } - - if (gfid_miss_count) { - afr_update_gfid_from_iatts (gfid, impunge_sh->buf, - impunge_sh->success_children, - priv->child_count); - if (uuid_is_null (gfid)) { - sh->entries_skipped = _gf_true; - gf_log (this->name, GF_LOG_INFO, "%s: Skipping entry " - "self-heal because of gfid absence", - impunge_local->loc.path); - goto done; - } - afr_sh_common_lookup (impunge_frame, this, &impunge_local->loc, - afr_sh_entry_common_lookup_done, gfid, - AFR_LOOKUP_FAIL_CONFLICTS | - AFR_LOOKUP_FAIL_MISSING_GFIDS, - NULL); - } else { - afr_sh_entry_call_impunge_recreate (impunge_frame, this); - } - return; -done: - afr_sh_entry_call_impunge_done (impunge_frame, this, - op_ret, op_errno); - return; + loc_wipe(&loc); + return 0; } -int -afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this, - gf_dirent_t *entry) +static int +afr_selfheal_entry_granular(call_frame_t *frame, xlator_t *this, fd_t *fd, + int subvol_idx, gf_boolean_t is_src) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *impunge_sh = NULL; - int ret = -1; - call_frame_t *impunge_frame = NULL; - afr_local_t *impunge_local = NULL; - int active_src = 0; - int op_errno = 0; - int op_ret = -1; - - local = frame->local; - sh = &local->self_heal; - - active_src = sh->active_source; - sh->impunge_done = afr_sh_entry_impunge_entry_done; - - if (can_skip_entry_self_heal (entry->d_name, &local->loc)) { - op_ret = 0; - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "inspecting existence of %s under %s", - entry->d_name, local->loc.path); - - ret = afr_impunge_frame_create (frame, this, active_src, - &impunge_frame); - if (ret) { - op_errno = -ret; - goto out; - } + int ret = 0; + loc_t loc = { + 0, + }; + xlator_t *subvol = NULL; + afr_private_t *priv = NULL; + afr_granular_esh_args_t args = { + 0, + }; + + priv = this->private; + subvol = priv->children[subvol_idx]; + + args.frame = afr_copy_frame(frame); + if (!args.frame) + goto out; + args.xl = this; + /* args.heal_fd represents the fd associated with the original directory + * on which entry heal is being attempted. + */ + args.heal_fd = fd; + + /* @subvol here represents the subvolume of AFR where + * indices/entry-changes/<pargfid> will be processed + */ + loc.inode = afr_shd_entry_changes_index_inode(this, subvol, + fd->inode->gfid); + if (!loc.inode) { + /* If granular heal failed on the sink (as it might sometimes + * because it is the src that would mostly contain the granular + * changelogs and the sink's entry-changes would be empty), + * do not treat heal as failure. + */ + if (is_src) + ret = -errno; + else + ret = 0; + goto out; + } - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - ret = afr_build_child_loc (this, &impunge_local->loc, &local->loc, - entry->d_name); - loc_copy (&impunge_sh->parent_loc, &local->loc); - if (ret != 0) { - op_errno = ENOMEM; - goto out; - } + ret = syncop_dir_scan(subvol, &loc, GF_CLIENT_PID_SELF_HEALD, &args, + afr_selfheal_entry_granular_dirent); - afr_sh_common_lookup (impunge_frame, this, &impunge_local->loc, - afr_sh_entry_common_lookup_done, NULL, - AFR_LOOKUP_FAIL_CONFLICTS, NULL); + loc_wipe(&loc); - op_ret = 0; + if (args.mismatch == _gf_true) + ret = -1; out: - if (ret) { - if (impunge_frame) - AFR_STACK_DESTROY (impunge_frame); - sh->impunge_done (frame, this, op_ret, op_errno); - } - - return 0; -} - - -int -afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - gf_dirent_t *entry = NULL; - off_t last_offset = 0; - int active_src = 0; - int entry_count = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - active_src = sh->active_source; - - if (op_ret <= 0) { - if (op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "readdir of %s on subvolume %s failed (%s)", - local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } else { - gf_log (this->name, GF_LOG_TRACE, - "readdir of %s on subvolume %s complete", - local->loc.path, - priv->children[active_src]->name); - } - - afr_sh_entry_impunge_all (frame, this); - return 0; - } - - list_for_each_entry (entry, &entries->list, list) { - last_offset = entry->d_off; - entry_count++; - } - - gf_log (this->name, GF_LOG_DEBUG, - "readdir'ed %d entries from %s", - entry_count, priv->children[active_src]->name); - - sh->offset = last_offset; - local->call_count = entry_count; - - list_for_each_entry (entry, &entries->list, list) { - afr_sh_entry_impunge_entry (frame, this, entry); - } - - return 0; -} - - -int -afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int32_t active_src = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - active_src = sh->active_source; - gf_log (this->name, GF_LOG_DEBUG, "%s: readdir from offset %zd", - local->loc.path, sh->offset); - - STACK_WIND (frame, afr_sh_entry_impunge_readdir_cbk, - priv->children[active_src], - priv->children[active_src]->fops->readdirp, - sh->healing_fd, sh->block_size, sh->offset, NULL); - - return 0; -} - - -int -afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int active_src = -1; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sh->offset = 0; - - active_src = next_active_source (frame, this, sh->active_source); - sh->active_source = active_src; - - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - afr_sh_entry_finish (frame, this); - return 0; - } - - if (active_src == -1) { - /* completed creating missing files on all subvolumes */ - afr_sh_entry_erase_pending (frame, this); - return 0; - } - - gf_log (this->name, GF_LOG_TRACE, - "impunging entries of %s on %s to other sinks", - local->loc.path, priv->children[active_src]->name); - - afr_sh_entry_impunge_subvol (frame, this); - - return 0; + if (args.frame) + AFR_STACK_DESTROY(args.frame); + return ret; } - -int -afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - child_index = (long) cookie; - - /* TODO: some of the open's might fail. - In that case, modify cleanup fn to send flush on those - fd's which are already open */ - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "opendir of %s failed on child %s (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - afr_sh_entry_finish (frame, this); - return 0; - } - gf_log (this->name, GF_LOG_TRACE, - "fd for %s opened, commencing sync", - local->loc.path); - - sh->active_source = -1; - afr_sh_entry_expunge_all (frame, this); - } - - return 0; -} - - -int -afr_sh_entry_open (call_frame_t *frame, xlator_t *this) +static int +afr_selfheal_entry_do(call_frame_t *frame, xlator_t *this, fd_t *fd, int source, + unsigned char *sources, unsigned char *healed_sinks) { - int i = 0; - int call_count = 0; - - int source = -1; - int *sources = NULL; - - fd_t *fd = NULL; - - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = local->self_heal.source; - sources = local->self_heal.sources; - - sh->block_size = priv->sh_readdir_size; - sh->offset = 0; - - call_count = sh->active_sinks; - if (source != -1) - call_count++; - - local->call_count = call_count; - - fd = fd_create (local->loc.inode, frame->root->pid); - sh->healing_fd = fd; - - if (source != -1) { - gf_log (this->name, GF_LOG_TRACE, - "opening directory %s on subvolume %s (source)", - local->loc.path, priv->children[source]->name); - - /* open source */ - STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk, - (void *) (long) source, - priv->children[source], - priv->children[source]->fops->opendir, - &local->loc, fd, NULL); - call_count--; - } - - /* open sinks */ - for (i = 0; i < priv->child_count; i++) { - if (sources[i] || !local->child_up[i]) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "opening directory %s on subvolume %s (sink)", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->opendir, - &local->loc, fd, NULL); - - if (!--call_count) - break; - } - - return 0; -} - + int i = 0; + int ret = 0; + gf_boolean_t mismatch = _gf_false; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + local = frame->local; + + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO, + "performing entry selfheal on %s", uuid_utoa(fd->inode->gfid)); + + for (i = 0; i < priv->child_count; i++) { + /* Expunge */ + if (!healed_sinks[i]) + continue; + + if (!local->need_full_crawl) + /* Why call afr_selfheal_entry_granular() on a "healed sink", + * given that it is the source that contains the granular + * indices? + * If the index for this directory is non-existent or empty on + * this subvol (=> clear sink), then it will return early + * without failure status. + * If the index is non-empty and it is yet a 'healed sink', then + * it is due to a split-brain in which case we anyway need to + * crawl the indices/entry-changes/pargfid directory. + */ + ret = afr_selfheal_entry_granular(frame, this, fd, i, _gf_false); + else + ret = afr_selfheal_entry_do_subvol(frame, this, fd, i); -int -afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - - afr_sh_mark_source_sinks (frame, this); - if (source != -1) - sh->success[source] = 1; - - if (sh->active_sinks == 0) { - gf_log (this->name, GF_LOG_TRACE, - "no active sinks for self-heal on dir %s", - local->loc.path); - afr_sh_entry_finish (frame, this); - return 0; - } - if (source == -1 && sh->active_sinks < 2) { - gf_log (this->name, GF_LOG_TRACE, - "cannot sync with 0 sources and 1 sink on dir %s", - local->loc.path); - afr_sh_entry_finish (frame, this); - return 0; + if (ret == -1) { + /* gfid or type mismatch. */ + mismatch = _gf_true; + ret = 0; } + if (ret) + break; + } - if (source != -1) - gf_log (this->name, GF_LOG_DEBUG, - "self-healing directory %s from subvolume %s to " - "%d other", - local->loc.path, priv->children[source]->name, - sh->active_sinks); + if (!ret && source != -1) { + /* Impunge */ + if (local->need_full_crawl) + ret = afr_selfheal_entry_do_subvol(frame, this, fd, source); else - gf_log (this->name, GF_LOG_DEBUG, - "no active sources for %s found. " - "merging all entries as a conservative decision", - local->loc.path); - - sh->actual_sh_started = _gf_true; - afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); - afr_sh_entry_open (frame, this); - - return 0; + ret = afr_selfheal_entry_granular(frame, this, fd, source, + _gf_true); + } + + if (mismatch == _gf_true) + /* undo pending will be skipped */ + ret = -1; + return ret; } - -void -afr_sh_entry_fix (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) +static int +__afr_selfheal_entry(call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; - int nsources = 0; - int32_t subvol_status = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (op_ret < 0) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_set_error (sh, op_errno); - afr_sh_entry_finish (frame, this); - goto out; - } - - if (sh->forced_merge) { - sh->source = -1; - goto heal; - } - - nsources = afr_build_sources (this, sh->xattr, sh->buf, - sh->pending_matrix, sh->sources, - sh->success_children, - AFR_ENTRY_TRANSACTION, &subvol_status, - _gf_true); - if ((subvol_status & ALL_FOOLS) || - (subvol_status & SPLIT_BRAIN)) { - gf_log (this->name, GF_LOG_INFO, "%s: Performing conservative " - "merge", local->loc.path); - source = -1; - memset (sh->sources, 0, - sizeof (*sh->sources) * priv->child_count); - } else if (nsources == 0) { - gf_log (this->name, GF_LOG_TRACE, - "No self-heal needed for %s", - local->loc.path); - - afr_sh_entry_finish (frame, this); - return; - } else { - source = afr_sh_select_source (sh->sources, priv->child_count); - } - - sh->source = source; - - afr_reset_children (sh->fresh_children, priv->child_count); - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_children, priv->child_count); - if (sh->source >= 0) - afr_inode_set_read_ctx (this, sh->inode, sh->source, - sh->fresh_children); - -heal: - afr_sh_entry_sync_prepare (frame, this); + int ret = -1; + int source = -1; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *data_lock = NULL; + unsigned char *postop_lock = NULL; + unsigned char *healed_sinks = NULL; + unsigned char *undid_pending = NULL; + struct afr_reply *locked_replies = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + gf_boolean_t did_sh = _gf_true; + + priv = this->private; + local = frame->local; + + sources = alloca0(priv->child_count); + sinks = alloca0(priv->child_count); + healed_sinks = alloca0(priv->child_count); + undid_pending = alloca0(priv->child_count); + data_lock = alloca0(priv->child_count); + postop_lock = alloca0(priv->child_count); + + locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count); + + ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL, + data_lock); + { + if (ret < priv->child_count) { + gf_msg_debug(this->name, 0, + "%s: Skipping " + "entry self-heal as only %d sub-volumes could " + "be locked in %s domain", + uuid_utoa(fd->inode->gfid), ret, this->name); + ret = -ENOTCONN; + goto unlock; + } + + ret = __afr_selfheal_entry_prepare(frame, this, fd->inode, data_lock, + sources, sinks, healed_sinks, + locked_replies, &source, NULL); + if (AFR_COUNT(healed_sinks, priv->child_count) == 0) { + did_sh = _gf_false; + goto unlock; + } + + local->need_full_crawl = afr_need_full_heal( + this, locked_replies, source, healed_sinks, AFR_ENTRY_TRANSACTION); + } +unlock: + afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL, data_lock, + NULL); + if (ret < 0) + goto out; + + if (!did_sh) + goto out; + + ret = afr_selfheal_entry_do(frame, this, fd, source, sources, healed_sinks); + if (ret) + goto out; + + /* Take entrylks in xlator domain before doing post-op (undo-pending) in + * entry self-heal. This is to prevent a parallel name self-heal on + * an entry under @fd->inode from reading pending xattrs while it is + * being modified by SHD after entry sh below, given that + * name self-heal takes locks ONLY in xlator domain and is free to read + * pending changelog in the absence of the following locking. + */ + ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL, + postop_lock); + { + if (AFR_CMP(data_lock, postop_lock, priv->child_count) != 0) { + gf_msg_debug(this->name, 0, + "%s: Skipping " + "post-op after entry self-heal as %d " + "sub-volumes, as opposed to %d, " + "could be locked in %s domain", + uuid_utoa(fd->inode->gfid), ret, + AFR_COUNT(data_lock, priv->child_count), this->name); + ret = -ENOTCONN; + goto postop_unlock; + } + + afr_selfheal_restore_time(frame, this, fd->inode, source, healed_sinks, + locked_replies); + ret = afr_selfheal_undo_pending( + frame, this, fd->inode, sources, sinks, healed_sinks, undid_pending, + AFR_ENTRY_TRANSACTION, locked_replies, postop_lock); + } +postop_unlock: + afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL, + postop_lock, NULL); out: - return; + if (did_sh) + afr_log_selfheal(fd->inode->gfid, this, ret, "entry", source, sources, + healed_sinks); + else + ret = 1; + + if (locked_replies) + afr_replies_wipe(locked_replies, priv->child_count); + return ret; } -int -afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this) +static fd_t * +afr_selfheal_data_opendir(xlator_t *this, inode_t *inode) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; - - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "Non Blocking entrylks " - "failed for %s.", local->loc.path); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_entry_done (frame, this); - } else { - - gf_log (this->name, GF_LOG_DEBUG, "Non Blocking entrylks done " - "for %s. Proceeding to FOP", local->loc.path); - afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_entry_fix, NULL, - AFR_LOOKUP_FAIL_CONFLICTS | - AFR_LOOKUP_FAIL_MISSING_GFIDS, - NULL); - } - - return 0; + loc_t loc = { + 0, + }; + int ret = 0; + fd_t *fd = NULL; + + fd = fd_create(inode, 0); + if (!fd) + return NULL; + + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + + ret = syncop_opendir(this, &loc, fd, NULL, NULL); + if (ret) { + fd_unref(fd); + fd = NULL; + } else { + fd_bind(fd); + } + + loc_wipe(&loc); + return fd; } int -afr_self_heal_entry (call_frame_t *frame, xlator_t *this) +afr_selfheal_entry(call_frame_t *frame, xlator_t *this, inode_t *inode) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sh->sh_type_in_action = AFR_SELF_HEAL_ENTRY; - - if (local->self_heal.do_entry_self_heal && priv->entry_self_heal) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); - afr_sh_entrylk (frame, this, &local->loc, NULL, - afr_sh_post_nonblocking_entry_cbk); - } else { - gf_log (this->name, GF_LOG_TRACE, - "proceeding to completion on %s", - local->loc.path); - afr_sh_entry_done (frame, this); - } - - return 0; + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + fd_t *fd = NULL; + int ret = 0; + + priv = this->private; + + fd = afr_selfheal_data_opendir(this, inode); + if (!fd) + return -EIO; + + locked_on = alloca0(priv->child_count); + + ret = afr_selfheal_tie_breaker_entrylk(frame, this, inode, priv->sh_domain, + NULL, locked_on); + { + if (ret < priv->child_count) { + gf_msg_debug(this->name, 0, + "%s: Skipping " + "entry self-heal as only %d sub-volumes could " + "be locked in %s domain", + uuid_utoa(fd->inode->gfid), ret, priv->sh_domain); + /* Either less than two subvols available, or another + selfheal (from another server) is in progress. Skip + for now in any case there isn't anything to do. + */ + ret = -ENOTCONN; + goto unlock; + } + + ret = __afr_selfheal_entry(frame, this, fd, locked_on); + } +unlock: + afr_selfheal_unentrylk(frame, this, inode, priv->sh_domain, NULL, locked_on, + NULL); + + if (fd) + fd_unref(fd); + + return ret; } diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index fd5da6cfd33..03f43bad16e 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -8,763 +8,539 @@ cases as published by the Free Software Foundation. */ -#include <libgen.h> -#include <unistd.h> -#include <fnmatch.h> -#include <sys/time.h> -#include <stdlib.h> -#include <signal.h> - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" - -#include "afr-transaction.h" #include "afr-self-heal.h" -#include "afr-self-heal-common.h" - - -int -afr_sh_metadata_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - afr_sh_reset (frame, this); - if (IA_ISDIR (sh->type)) { - gf_log (this->name, GF_LOG_DEBUG, - "proceeding to entry check on %s", - local->loc.path); - afr_self_heal_entry (frame, this); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "proceeding to data check on %s", - local->loc.path); - afr_self_heal_data (frame, this); - } - - return 0; -} - -int -afr_sh_inode_unlock (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - int_lock->lock_cbk = afr_sh_metadata_done; - afr_unlock (frame, this); - - return 0; -} - -int -afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this) -{ - afr_sh_inode_unlock (frame, this); +#include <glusterfs/byte-order.h> +#include "protocol-common.h" +#include <glusterfs/events.h> - return 0; -} +#define AFR_HEAL_ATTR (GF_SET_ATTR_UID | GF_SET_ATTR_GID | GF_SET_ATTR_MODE) -int -afr_sh_metadata_fail (call_frame_t *frame, xlator_t *this) +static gf_boolean_t +_afr_ignorable_key_match(dict_t *d, char *k, data_t *val, void *mdata) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_metadata_finish (frame, this); - return 0; + return afr_is_xattr_ignorable(k); } -int -afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr, dict_t *xdata) -{ - afr_local_t *local = NULL; - int call_count = 0; - long i = 0; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - priv = this->private; - sh = &local->self_heal; - i = (long)cookie; - - if ((!IA_ISREG (sh->buf[sh->source].ia_type)) && - (!IA_ISDIR (sh->buf[sh->source].ia_type))) { - afr_children_add_child (sh->fresh_children, i, - priv->child_count); - } - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if ((!IA_ISREG (sh->buf[sh->source].ia_type)) && - (!IA_ISDIR (sh->buf[sh->source].ia_type))) { - afr_inode_set_read_ctx (this, sh->inode, sh->source, - sh->fresh_children); - } - afr_sh_metadata_finish (frame, this); - } - - return 0; -} - -int -afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this) +void +afr_delete_ignorable_xattrs(dict_t *xattr) { - afr_sh_erase_pending (frame, this, AFR_METADATA_TRANSACTION, - afr_sh_metadata_erase_pending_cbk, - afr_sh_metadata_finish); - return 0; + dict_foreach_match(xattr, _afr_ignorable_key_match, NULL, + dict_remove_foreach_fn, NULL); } - int -afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +__afr_selfheal_metadata_do(call_frame_t *frame, xlator_t *this, inode_t *inode, + int source, unsigned char *healed_sinks, + struct afr_reply *locked_replies) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = 0; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "setting attributes failed for %s on %s (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - - sh->success[child_index] = 0; - } + int ret = -1; + loc_t loc = { + 0, + }; + dict_t *xattr = NULL; + dict_t *old_xattr = NULL; + afr_private_t *priv = NULL; + int i = 0; + + priv = this->private; + + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO, + "performing metadata selfheal on %s", uuid_utoa(inode->gfid)); + + ret = syncop_getxattr(priv->children[source], &loc, &xattr, NULL, NULL, + NULL); + if (ret < 0) { + ret = -EIO; + goto out; + } + + afr_delete_ignorable_xattrs(xattr); + + for (i = 0; i < priv->child_count; i++) { + if (old_xattr) { + dict_unref(old_xattr); + old_xattr = NULL; } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - if (call_count == 0) { - if (local->xattr_req) { - dict_unref (local->xattr_req); - local->xattr_req = NULL; - } - afr_sh_metadata_erase_pending (frame, this); + if (!healed_sinks[i]) + continue; + + ret = syncop_setattr(priv->children[i], &loc, + &locked_replies[source].poststat, AFR_HEAL_ATTR, + NULL, NULL, NULL, NULL); + if (ret) + healed_sinks[i] = 0; + + ret = syncop_getxattr(priv->children[i], &loc, &old_xattr, 0, NULL, + NULL); + if (old_xattr) { + afr_delete_ignorable_xattrs(old_xattr); + ret = syncop_removexattr(priv->children[i], &loc, "", old_xattr, + NULL); + if (ret) + healed_sinks[i] = 0; } - return 0; -} - - -int -afr_sh_metadata_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop, dict_t *xdata) -{ - afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno, xdata); + ret = syncop_setxattr(priv->children[i], &loc, xattr, 0, NULL, NULL); + if (ret) + healed_sinks[i] = 0; + } + ret = 0; - return 0; -} - - -int -afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno, xdata); +out: + loc_wipe(&loc); + if (xattr) + dict_unref(xattr); + if (old_xattr) + dict_unref(old_xattr); - return 0; + return ret; } -int -afr_sh_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - dict_t *xdata) +static uint64_t +mtime_ns(struct iatt *ia) { - int i = 0; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - priv = this->private; - local = frame->local; + uint64_t ret; - if (op_ret < 0) { - afr_sh_metadata_sync_cbk (frame, cookie, - this, -1, op_errno, xdata); - goto out; - } - - i = (long) cookie; + ret = (((uint64_t)(ia->ia_mtime)) * 1000000000) + + (uint64_t)(ia->ia_mtime_nsec); - STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setxattr, - &local->loc, local->xattr_req, 0, NULL); - - out: - return 0; + return ret; } -inline void -afr_prune_special_keys (dict_t *xattr_dict) +/* + * When directory content is modified, [mc]time is updated. On + * Linux, the filesystem does it, while at least on NetBSD, the + * kernel file-system independent code does it. This means that + * when entries are added while bricks are down, the kernel sends + * a SETATTR [mc]time which will cause metadata split brain for + * the directory. In this case, clear the split brain by finding + * the source with the most recent modification date. + */ +static int +afr_dirtime_splitbrain_source(call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, + unsigned char *locked_on) { - dict_del (xattr_dict, GF_SELINUX_XATTR_KEY); + afr_private_t *priv = NULL; + int source = -1; + struct iatt source_ia; + struct iatt child_ia; + uint64_t mtime = 0; + int i; + int ret = -1; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!locked_on[i]) + continue; + + if (!replies[i].valid) + continue; + + if (replies[i].op_ret != 0) + continue; + + if (mtime_ns(&replies[i].poststat) <= mtime) + continue; + + mtime = mtime_ns(&replies[i].poststat); + source = i; + } + + if (source == -1) + goto out; + + source_ia = replies[source].poststat; + if (source_ia.ia_type != IA_IFDIR) + goto out; + + for (i = 0; i < priv->child_count; i++) { + if (i == source) + continue; + + if (!replies[i].valid) + continue; + + if (replies[i].op_ret != 0) + continue; + + child_ia = replies[i].poststat; + + if (!IA_EQUAL(source_ia, child_ia, gfid) || + !IA_EQUAL(source_ia, child_ia, type) || + !IA_EQUAL(source_ia, child_ia, prot) || + !IA_EQUAL(source_ia, child_ia, uid) || + !IA_EQUAL(source_ia, child_ia, gid) || + !afr_xattrs_are_equal(replies[source].xdata, replies[i].xdata)) + goto out; + } + + /* + * Metadata split brain is just about [amc]time + * We return our source. + */ + ret = source; +out: + return ret; } -inline void -afr_prune_pending_keys (dict_t *xattr_dict, afr_private_t *priv) +static int +__afr_selfheal_metadata_mark_pending_xattrs(call_frame_t *frame, xlator_t *this, + inode_t *inode, + struct afr_reply *replies, + unsigned char *sources) { - int i = 0; - - for (; i < priv->child_count; i++) { - dict_del (xattr_dict, priv->pending_key[i]); + int ret = 0; + int i = 0; + int m_idx = 0; + afr_private_t *priv = NULL; + int raw[AFR_NUM_CHANGE_LOGS] = {0}; + dict_t *xattr = NULL; + + priv = this->private; + m_idx = afr_index_for_transaction_type(AFR_METADATA_TRANSACTION); + raw[m_idx] = 1; + + xattr = dict_new(); + if (!xattr) + return -ENOMEM; + + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) + continue; + ret = dict_set_static_bin(xattr, priv->pending_key[i], raw, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + ret = -1; + goto out; } -} - -int -afr_sh_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr, - dict_t *xdata) -{ - int i = 0; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - priv = this->private; - local = frame->local; - - if (op_ret < 0) { - afr_sh_metadata_sync_cbk (frame, cookie, - this, -1, op_errno, xdata); - goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + ret = afr_selfheal_post_op(frame, this, inode, i, xattr, NULL); + if (ret < 0) { + gf_msg(this->name, GF_LOG_INFO, -ret, AFR_MSG_SELF_HEAL_INFO, + "Failed to set pending metadata xattr on child %d for %s", i, + uuid_utoa(inode->gfid)); + goto out; } + } - afr_prune_pending_keys (xattr, priv); - - afr_prune_special_keys (xattr); + afr_replies_wipe(replies, priv->child_count); + ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies); - i = (long) cookie; - - /* send removexattr in bulk via xdata */ - STACK_WIND_COOKIE (frame, afr_sh_removexattr_cbk, - cookie, - priv->children[i], - priv->children[i]->fops->removexattr, - &local->loc, "", xattr); - - out: - return 0; -} - -int -afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; - int active_sinks = 0; - int call_count = 0; - int i = 0; - - struct iatt stbuf = {0,}; - int32_t valid = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - active_sinks = sh->active_sinks; - - /* - * 2 calls per sink - setattr, setxattr - */ - if (xattr) { - call_count = active_sinks * 2; - local->xattr_req = dict_ref (xattr); - } else - call_count = active_sinks; - - local->call_count = call_count; - - stbuf.ia_atime = sh->buf[source].ia_atime; - stbuf.ia_atime_nsec = sh->buf[source].ia_atime_nsec; - stbuf.ia_mtime = sh->buf[source].ia_mtime; - stbuf.ia_mtime_nsec = sh->buf[source].ia_mtime_nsec; - - stbuf.ia_uid = sh->buf[source].ia_uid; - stbuf.ia_gid = sh->buf[source].ia_gid; - - stbuf.ia_type = sh->buf[source].ia_type; - stbuf.ia_prot = sh->buf[source].ia_prot; - - valid = GF_SET_ATTR_MODE | - GF_SET_ATTR_UID | GF_SET_ATTR_GID | - GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - - for (i = 0; i < priv->child_count; i++) { - if (call_count == 0) { - break; - } - if (sh->sources[i] || !local->child_up[i]) - continue; - - gf_log (this->name, GF_LOG_DEBUG, - "self-healing metadata of %s from %s to %s", - local->loc.path, priv->children[source]->name, - priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_metadata_setattr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setattr, - &local->loc, &stbuf, valid, NULL); - - call_count--; - - if (!xattr) - continue; - - STACK_WIND_COOKIE (frame, afr_sh_getxattr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->getxattr, - &local->loc, NULL, NULL); - call_count--; - } - - return 0; -} - - -int -afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr, - dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "getxattr of %s failed on subvolume %s (%s). proceeding without xattr", - local->loc.path, priv->children[source]->name, - strerror (op_errno)); - - afr_sh_metadata_sync (frame, this, NULL); - } else { - afr_prune_pending_keys (xattr, priv); - afr_sh_metadata_sync (frame, this, xattr); - } - - return 0; +out: + if (xattr) + dict_unref(xattr); + return ret; } -static void -afr_set_metadata_sh_info_str (afr_local_t *local, afr_self_heal_t *sh, - xlator_t *this) +/* + * Look for mismatching uid/gid or mode or user xattrs even if + * AFR xattrs don't say so, and pick one arbitrarily as winner. */ + +static int +__afr_selfheal_metadata_finalize_source(call_frame_t *frame, xlator_t *this, + inode_t *inode, unsigned char *sources, + unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *undid_pending, + unsigned char *locked_on, + struct afr_reply *replies) { - afr_private_t *priv = NULL; - int i = 0; - char num[1024] = {0}; - size_t len = 0; - char *string = NULL; - size_t off = 0; - char *source_child = " from source %s to"; - char *format = " %s, "; - char *string_msg = " metadata self heal"; - char *pending_matrix_str = NULL; - int down_child_present = 0; - int unknown_child_present = 0; - char *down_subvol_1 = " down subvolume is "; - char *unknown_subvol_1 = " unknown subvolume is"; - char *down_subvol_2 = " down subvolumes are "; - char *unknown_subvol_2 = " unknown subvolumes are "; - int down_count = 0; - int unknown_count = 0; - - priv = this->private; - - pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix, - this); - - if (!pending_matrix_str) - pending_matrix_str = ""; - - len += snprintf (num, sizeof (num), "%s", string_msg); - - for (i = 0; i < priv->child_count; i++) { - if ((sh->source == i) && (local->child_up[i] == 1)) { - len += snprintf (num, sizeof (num), source_child, - priv->children[i]->name); - } else if ((local->child_up[i] == 1) && (sh->sources[i] == 0)) { - len += snprintf (num, sizeof (num), format, - priv->children[i]->name); - } else if (local->child_up[i] == 0) { - len += snprintf (num, sizeof (num), format, - priv->children[i]->name); - if (!down_child_present) - down_child_present = 1; - down_count++; - } else if (local->child_up[i] == -1) { - len += snprintf (num, sizeof (num), format, - priv->children[i]->name); - if (!unknown_child_present) - unknown_child_present = 1; - unknown_count++; - } - } - - if (down_child_present) { - if (down_count > 1) { - len += snprintf (num, sizeof (num), "%s", - down_subvol_2); - } else { - len += snprintf (num, sizeof (num), "%s", - down_subvol_1); - } - } - if (unknown_child_present) { - if (unknown_count > 1) { - len += snprintf (num, sizeof (num), "%s", - unknown_subvol_2); - } else { - len += snprintf (num, sizeof (num), "%s", - unknown_subvol_1); - } - } - - len ++; - - string = GF_CALLOC (len, sizeof (char), gf_common_mt_char); - if (!string) - return; - - off += snprintf (string + off, len - off, "%s", string_msg); - for (i=0; i < priv->child_count; i++) { - if ((sh->source == i) && (local->child_up[i] == 1)) - off += snprintf (string + off, len - off, source_child, - priv->children[i]->name); + int i = 0; + afr_private_t *priv = NULL; + struct iatt srcstat = { + 0, + }; + int source = -1; + int sources_count = 0; + int ret = 0; + + priv = this->private; + + sources_count = AFR_COUNT(sources, priv->child_count); + + if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) || + !sources_count) { + source = afr_mark_split_brain_source_sinks( + frame, this, inode, sources, sinks, healed_sinks, locked_on, + replies, AFR_METADATA_TRANSACTION); + if (source >= 0) { + _afr_fav_child_reset_sink_xattrs( + frame, this, inode, source, healed_sinks, undid_pending, + AFR_METADATA_TRANSACTION, locked_on, replies); + goto out; } - for (i = 0; i < priv->child_count; i++) { - if ((local->child_up[i] == 1)&& (sh->sources[i] == 0)) - off += snprintf (string + off, len - off, format, - priv->children[i]->name); + /* If this is a directory mtime/ctime only split brain + use the most recent */ + source = afr_dirtime_splitbrain_source(frame, this, replies, locked_on); + if (source != -1) { + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SPLIT_BRAIN, + "clear time " + "split brain on %s", + uuid_utoa(replies[source].poststat.ia_gfid)); + sources[source] = 1; + healed_sinks[source] = 0; + goto out; } - if (down_child_present) { - if (down_count > 1) { - off += snprintf (string + off, len - off, "%s", - down_subvol_2); - } else { - off += snprintf (string + off, len - off, "%s", - down_subvol_1); - } + if (!priv->metadata_splitbrain_forced_heal) { + gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" + "subvol=%s;" + "type=metadata;file=%s", + this->ctx->cmd_args.client_pid, this->name, + uuid_utoa(inode->gfid)); + return -EIO; } + /* Metadata split brain, select one subvol + arbitrarily */ for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] == 0) - off += snprintf (string + off, len - off, format, - priv->children[i]->name); + if (locked_on[i] && healed_sinks[i]) { + sources[i] = 1; + healed_sinks[i] = 0; + break; + } } - - if (unknown_child_present) { - if (unknown_count > 1) { - off += snprintf (string + off, len - off, "%s", - unknown_subvol_2); - } else { - off += snprintf (string + off, len - off, "%s", - unknown_subvol_1); - } + } + + /* No split brain at this point. If we were called from + * afr_heal_splitbrain_file(), abort.*/ + if (afr_dict_contains_heal_op(frame)) + return -EIO; + + source = afr_choose_source_by_policy(priv, sources, + AFR_METADATA_TRANSACTION); + srcstat = replies[source].poststat; + + for (i = 0; i < priv->child_count; i++) { + if (!sources[i] || i == source) + continue; + if (!IA_EQUAL(srcstat, replies[i].poststat, type) || + !IA_EQUAL(srcstat, replies[i].poststat, uid) || + !IA_EQUAL(srcstat, replies[i].poststat, gid) || + !IA_EQUAL(srcstat, replies[i].poststat, prot)) { + gf_msg_debug(this->name, 0, + "%s: iatt mismatch " + "for source(%d) vs (%d)", + uuid_utoa(replies[source].poststat.ia_gfid), source, + i); + sources[i] = 0; + healed_sinks[i] = 1; } - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] == -1) - off += snprintf (string + off, len - off, format, - priv->children[i]->name); + } + + for (i = 0; i < priv->child_count; i++) { + if (!sources[i] || i == source) + continue; + if (!afr_xattrs_are_equal(replies[source].xdata, replies[i].xdata)) { + gf_msg_debug(this->name, 0, + "%s: xattr mismatch " + "for source(%d) vs (%d)", + uuid_utoa(replies[source].poststat.ia_gfid), source, + i); + sources[i] = 0; + healed_sinks[i] = 1; } - - gf_asprintf (&sh->metadata_sh_info, "%s metadata %s,", string, - pending_matrix_str); - - if (pending_matrix_str && strcmp (pending_matrix_str, "")) - GF_FREE (pending_matrix_str); - - if (string && strcmp (string, "")) - GF_FREE (string); + } + if ((sources_count == priv->child_count) && (source > -1) && + (AFR_COUNT(healed_sinks, priv->child_count) != 0)) { + ret = __afr_selfheal_metadata_mark_pending_xattrs(frame, this, inode, + replies, sources); + if (ret < 0) + return ret; + } +out: + afr_mark_active_sinks(this, sources, locked_on, healed_sinks); + return source; } int -afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) +__afr_selfheal_metadata_prepare(call_frame_t *frame, xlator_t *this, + inode_t *inode, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *undid_pending, + struct afr_reply *replies, unsigned char *pflag) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - - afr_sh_mark_source_sinks (frame, this); - if (sh->active_sinks == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "no active sinks for performing self-heal on file %s", - local->loc.path); - afr_sh_metadata_finish (frame, this); - return 0; + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + int i = 0; + uint64_t *witness = NULL; + + priv = this->private; + + ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies); + if (ret) + return ret; + + witness = alloca0(sizeof(*witness) * priv->child_count); + ret = afr_selfheal_find_direction(frame, this, replies, + AFR_METADATA_TRANSACTION, locked_on, + sources, sinks, witness, pflag); + if (ret) + return ret; + + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). + + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. + */ + AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count); + + /* If any source has witness, pick first + * witness source and make everybody else sinks */ + for (i = 0; i < priv->child_count; i++) { + if (sources[i] && witness[i]) { + source = i; + break; } + } - gf_log (this->name, GF_LOG_TRACE, - "syncing metadata of %s from subvolume %s to %d active sinks", - local->loc.path, priv->children[source]->name, - sh->active_sinks); - - sh->actual_sh_started = _gf_true; - afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); - afr_set_metadata_sh_info_str (local, sh, this); - STACK_WIND (frame, afr_sh_metadata_getxattr_cbk, - priv->children[source], - priv->children[source]->fops->getxattr, - &local->loc, NULL, NULL); - - return 0; -} - - -void -afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int nsources = 0; - int source = 0; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (op_ret < 0) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_set_error (sh, op_errno); - afr_sh_metadata_finish (frame, this); - goto out; - } - nsources = afr_build_sources (this, sh->xattr, sh->buf, - sh->pending_matrix, sh->sources, - sh->success_children, - AFR_METADATA_TRANSACTION, NULL, _gf_false); - if ((nsources == -1) - && (priv->favorite_child != -1) - && (sh->child_errno[priv->favorite_child] == 0)) { - - gf_log (this->name, GF_LOG_WARNING, - "Picking favorite child %s as authentic source to resolve conflicting metadata of %s", - priv->children[priv->favorite_child]->name, - local->loc.path); - - sh->sources[priv->favorite_child] = 1; - - nsources = afr_sh_source_count (sh->sources, - priv->child_count); - } - - if (nsources == -1) { - afr_sh_print_split_brain_log (sh->pending_matrix, this, - local->loc.path); - afr_set_split_brain (this, sh->inode, SPB, DONT_KNOW); - afr_sh_metadata_fail (frame, this); - goto out; - } - - afr_set_split_brain (this, sh->inode, NO_SPB, DONT_KNOW); - if (nsources == 0) { - gf_log (this->name, GF_LOG_TRACE, - "No self-heal needed for %s", - local->loc.path); - - afr_sh_metadata_finish (frame, this); - goto out; - } - - source = afr_sh_select_source (sh->sources, priv->child_count); - - if (source == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "No active sources found."); - - afr_sh_metadata_finish (frame, this); - goto out; - } - - sh->source = source; - - /* detect changes not visible through pending flags -- JIC */ + if (source != -1) { for (i = 0; i < priv->child_count; i++) { - if (i == source || sh->child_errno[i]) - continue; - - if (PERMISSION_DIFFERS (&sh->buf[i], &sh->buf[source])) - sh->sources[i] = 0; - - if (OWNERSHIP_DIFFERS (&sh->buf[i], &sh->buf[source])) - sh->sources[i] = 0; + if (i != source && sources[i]) { + sources[i] = 0; + healed_sinks[i] = 1; + } } + } - if ((!IA_ISREG (sh->buf[source].ia_type)) && - (!IA_ISDIR (sh->buf[source].ia_type))) { - afr_reset_children (sh->fresh_children, priv->child_count); - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_children, priv->child_count); - afr_inode_set_read_ctx (this, sh->inode, sh->source, - sh->fresh_children); - } + source = __afr_selfheal_metadata_finalize_source( + frame, this, inode, sources, sinks, healed_sinks, undid_pending, + locked_on, replies); - if (sh->do_metadata_self_heal && priv->metadata_self_heal) - afr_sh_metadata_sync_prepare (frame, this); - else - afr_sh_metadata_finish (frame, this); -out: - return; -} - -int -afr_sh_metadata_post_nonblocking_inodelk_cbk (call_frame_t *frame, - xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, "Non Blocking metadata " - "inodelks failed for %s.", local->loc.path); - gf_log (this->name, GF_LOG_DEBUG, "Metadata self-heal " - "failed for %s.", local->loc.path); - afr_sh_metadata_done (frame, this); - } else { - - gf_log (this->name, GF_LOG_DEBUG, "Non Blocking metadata " - "inodelks done for %s. Proceeding to FOP", - local->loc.path); - afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_metadata_fix, NULL, - AFR_LOOKUP_FAIL_CONFLICTS | - AFR_LOOKUP_FAIL_MISSING_GFIDS, - NULL); - } + if (source < 0) + return -EIO; - return 0; + return source; } int -afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this) +afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode) { - afr_internal_lock_t *int_lock = NULL; - afr_inodelk_t *inodelk = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - int_lock->domain = this->name; - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - int_lock->transaction_lk_type = AFR_SELFHEAL_LK; - int_lock->selfheal_lk_type = AFR_METADATA_SELF_HEAL_LK; - - afr_set_lock_number (frame, this); + afr_private_t *priv = NULL; + int ret = -1; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *data_lock = NULL; + unsigned char *healed_sinks = NULL; + unsigned char *undid_pending = NULL; + struct afr_reply *locked_replies = NULL; + gf_boolean_t did_sh = _gf_true; + int source = -1; + + priv = this->private; + + sources = alloca0(priv->child_count); + sinks = alloca0(priv->child_count); + healed_sinks = alloca0(priv->child_count); + undid_pending = alloca0(priv->child_count); + data_lock = alloca0(priv->child_count); + + locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count); + + ret = afr_selfheal_inodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0, + data_lock); + { + if (ret < priv->child_count) { + ret = -ENOTCONN; + goto unlock; + } - inodelk->flock.l_start = LLONG_MAX - 1; - inodelk->flock.l_len = 0; - inodelk->flock.l_type = F_WRLCK; - int_lock->lock_cbk = afr_sh_metadata_post_nonblocking_inodelk_cbk; + ret = __afr_selfheal_metadata_prepare( + frame, this, inode, data_lock, sources, sinks, healed_sinks, + undid_pending, locked_replies, NULL); + if (ret < 0) + goto unlock; - afr_nonblocking_inodelk (frame, this); + source = ret; - return 0; -} + if (AFR_COUNT(healed_sinks, priv->child_count) == 0) { + did_sh = _gf_false; + goto unlock; + } -gf_boolean_t -afr_can_start_metadata_self_heal (afr_self_heal_t *sh, afr_private_t *priv) -{ - if (sh->force_confirm_spb) - return _gf_true; - if (sh->do_metadata_self_heal && priv->metadata_self_heal) - return _gf_true; - return _gf_false; + ret = __afr_selfheal_metadata_do(frame, this, inode, source, + healed_sinks, locked_replies); + if (ret) + goto unlock; + + afr_selfheal_restore_time(frame, this, inode, source, healed_sinks, + locked_replies); + + ret = afr_selfheal_undo_pending( + frame, this, inode, sources, sinks, healed_sinks, undid_pending, + AFR_METADATA_TRANSACTION, locked_replies, data_lock); + } +unlock: + afr_selfheal_uninodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0, + data_lock); + + if (did_sh) + afr_log_selfheal(inode->gfid, this, ret, "metadata", source, sources, + healed_sinks); + else + ret = 1; + + if (locked_replies) + afr_replies_wipe(locked_replies, priv->child_count); + return ret; } int -afr_self_heal_metadata (call_frame_t *frame, xlator_t *this) +afr_selfheal_metadata_by_stbuf(xlator_t *this, struct iatt *stbuf) { - afr_local_t *local = NULL; - afr_private_t *priv = this->private; - afr_self_heal_t *sh = &local->self_heal; - - local = frame->local; - sh = &local->self_heal; - sh->sh_type_in_action = AFR_SELF_HEAL_METADATA; - - if (afr_can_start_metadata_self_heal (sh, priv)) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); - afr_sh_metadata_lock (frame, this); - } else { - afr_sh_metadata_done (frame, this); - } - - return 0; + inode_t *inode = NULL; + inode_t *link_inode = NULL; + call_frame_t *frame = NULL; + int ret = 0; + + if (gf_uuid_is_null(stbuf->ia_gfid)) { + ret = -EINVAL; + goto out; + } + + inode = inode_new(this->itable); + if (!inode) { + ret = -ENOMEM; + goto out; + } + + link_inode = inode_link(inode, NULL, NULL, stbuf); + if (!link_inode) { + ret = -ENOMEM; + goto out; + } + + frame = afr_frame_create(this, &ret); + if (!frame) { + ret = -ret; + goto out; + } + + ret = afr_selfheal_metadata(frame, this, link_inode); +out: + if (inode) + inode_unref(inode); + if (link_inode) + inode_unref(link_inode); + if (frame) + AFR_STACK_DESTROY(frame); + return ret; } diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c new file mode 100644 index 00000000000..834aac86d48 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-name.c @@ -0,0 +1,616 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <glusterfs/events.h> +#include "afr.h" +#include "afr-self-heal.h" +#include "afr-messages.h" + +int +__afr_selfheal_assign_gfid(xlator_t *this, inode_t *parent, uuid_t pargfid, + const char *bname, inode_t *inode, + struct afr_reply *replies, void *gfid, + unsigned char *locked_on, int source, + unsigned char *sources, gf_boolean_t is_gfid_absent, + int *gfid_idx) +{ + int ret = 0; + int up_count = 0; + int locked_count = 0; + afr_private_t *priv = NULL; + + priv = this->private; + + gf_uuid_copy(parent->gfid, pargfid); + + if (is_gfid_absent) { + /* Ensure all children of AFR are up before performing gfid heal, to + * guard against the possibility of gfid split brain. */ + + up_count = AFR_COUNT(priv->child_up, priv->child_count); + if (up_count != priv->child_count) { + ret = -EIO; + goto out; + } + + locked_count = AFR_COUNT(locked_on, priv->child_count); + if (locked_count != priv->child_count) { + ret = -EIO; + goto out; + } + } + + ret = afr_lookup_and_heal_gfid(this, parent, bname, inode, replies, source, + sources, gfid, gfid_idx); + +out: + return ret; +} + +int +__afr_selfheal_name_impunge(call_frame_t *frame, xlator_t *this, + inode_t *parent, uuid_t pargfid, const char *bname, + inode_t *inode, struct afr_reply *replies, + int gfid_idx) +{ + int i = 0; + afr_private_t *priv = NULL; + int ret = 0; + unsigned char *sources = NULL; + + priv = this->private; + + sources = alloca0(priv->child_count); + + gf_uuid_copy(parent->gfid, pargfid); + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret != 0) + continue; + + if (gf_uuid_compare(replies[i].poststat.ia_gfid, + replies[gfid_idx].poststat.ia_gfid) == 0) { + sources[i] = 1; + continue; + } + } + + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) + continue; + + ret |= afr_selfheal_recreate_entry(frame, i, gfid_idx, sources, parent, + bname, inode, replies); + } + + return ret; +} + +int +__afr_selfheal_name_expunge(xlator_t *this, inode_t *parent, uuid_t pargfid, + const char *bname, inode_t *inode, + struct afr_reply *replies) +{ + int i = 0; + afr_private_t *priv = NULL; + int ret = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (replies[i].op_ret) + continue; + + ret |= afr_selfheal_entry_delete(this, parent, bname, inode, i, + replies); + } + + return ret; +} + +static gf_boolean_t +afr_selfheal_name_need_heal_check(xlator_t *this, struct afr_reply *replies) +{ + int i = 0; + int first_idx = -1; + gf_boolean_t need_heal = _gf_false; + afr_private_t *priv = NULL; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if ((replies[i].op_ret == -1) && (replies[i].op_errno == ENODATA)) + need_heal = _gf_true; + + if (first_idx == -1) { + first_idx = i; + continue; + } + + if (replies[i].op_ret != replies[first_idx].op_ret) + need_heal = _gf_true; + + if (gf_uuid_compare(replies[i].poststat.ia_gfid, + replies[first_idx].poststat.ia_gfid)) + need_heal = _gf_true; + + if ((replies[i].op_ret == 0) && + (gf_uuid_is_null(replies[i].poststat.ia_gfid))) + need_heal = _gf_true; + } + + return need_heal; +} + +static int +afr_selfheal_name_type_mismatch_check(xlator_t *this, struct afr_reply *replies, + int source, unsigned char *sources, + uuid_t pargfid, const char *bname) +{ + int i = 0; + int type_idx = -1; + ia_type_t inode_type = IA_INVAL; + ia_type_t inode_type1 = IA_INVAL; + afr_private_t *priv = NULL; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret != 0) + continue; + + if (replies[i].poststat.ia_type == IA_INVAL) + continue; + + if (inode_type == IA_INVAL) { + inode_type = replies[i].poststat.ia_type; + type_idx = i; + continue; + } + inode_type1 = replies[i].poststat.ia_type; + if (sources[i] || source == -1) { + if ((sources[type_idx] || source == -1) && + (inode_type != inode_type1)) { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN, + "Type mismatch for <gfid:%s>/%s: " + "%s on %s and %s on %s", + uuid_utoa(pargfid), bname, + gf_inode_type_to_str(inode_type1), + priv->children[i]->name, + gf_inode_type_to_str(inode_type), + priv->children[type_idx]->name); + gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" + "subvol=%s;type=file;" + "file=<gfid:%s>/%s;count=2;" + "child-%d=%s;type-%d=%s;child-%d=%s;" + "type-%d=%s", + this->ctx->cmd_args.client_pid, this->name, + uuid_utoa(pargfid), bname, i, priv->children[i]->name, + i, gf_inode_type_to_str(inode_type1), type_idx, + priv->children[type_idx]->name, type_idx, + gf_inode_type_to_str(inode_type)); + return -EIO; + } + inode_type = replies[i].poststat.ia_type; + type_idx = i; + } + } + return 0; +} + +static int +afr_selfheal_name_gfid_mismatch_check(xlator_t *this, struct afr_reply *replies, + int source, unsigned char *sources, + int *gfid_idx, uuid_t pargfid, + const char *bname, inode_t *inode, + unsigned char *locked_on, dict_t *xdata) +{ + int i = 0; + int gfid_idx_iter = -1; + int ret = -1; + void *gfid = NULL; + void *gfid1 = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret != 0) + continue; + + if (gf_uuid_is_null(replies[i].poststat.ia_gfid)) + continue; + + if (!gfid) { + gfid = &replies[i].poststat.ia_gfid; + gfid_idx_iter = i; + continue; + } + + gfid1 = &replies[i].poststat.ia_gfid; + if (sources[i] || source == -1) { + if ((sources[gfid_idx_iter] || source == -1) && + gf_uuid_compare(gfid, gfid1)) { + ret = afr_gfid_split_brain_source(this, replies, inode, pargfid, + bname, gfid_idx_iter, i, + locked_on, gfid_idx, xdata); + if (!ret && *gfid_idx >= 0) { + ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg", + "GFID split-brain resolved"); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, + AFR_MSG_DICT_SET_FAILED, + "Error setting gfid-" + "heal-msg dict"); + } + return ret; + } + gfid = &replies[i].poststat.ia_gfid; + gfid_idx_iter = i; + } + } + + *gfid_idx = gfid_idx_iter; + return 0; +} + +static gf_boolean_t +afr_selfheal_name_source_empty_check(xlator_t *this, struct afr_reply *replies, + unsigned char *sources, int source) +{ + int i = 0; + afr_private_t *priv = NULL; + gf_boolean_t source_is_empty = _gf_true; + + priv = this->private; + + if (source == -1) { + source_is_empty = _gf_false; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + + if (replies[i].op_ret == -1 && replies[i].op_errno == ENOENT) + continue; + + source_is_empty = _gf_false; + break; + } +out: + return source_is_empty; +} + +int +__afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, const char *bname, inode_t *inode, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, int source, + unsigned char *locked_on, struct afr_reply *replies, + void *gfid_req, dict_t *xdata) +{ + int gfid_idx = -1; + int ret = -1; + void *gfid = NULL; + gf_boolean_t source_is_empty = _gf_true; + gf_boolean_t need_heal = _gf_false; + gf_boolean_t is_gfid_absent = _gf_false; + + need_heal = afr_selfheal_name_need_heal_check(this, replies); + if (!need_heal) + return 0; + + source_is_empty = afr_selfheal_name_source_empty_check(this, replies, + sources, source); + if (source_is_empty) { + ret = __afr_selfheal_name_expunge(this, parent, pargfid, bname, inode, + replies); + if (ret == -EIO) + ret = -1; + return ret; + } + + ret = afr_selfheal_name_type_mismatch_check(this, replies, source, sources, + pargfid, bname); + if (ret) + return ret; + + ret = afr_selfheal_name_gfid_mismatch_check(this, replies, source, sources, + &gfid_idx, pargfid, bname, + inode, locked_on, xdata); + if (ret) + return ret; + + if (gfid_idx == -1) { + if (!gfid_req || gf_uuid_is_null(gfid_req)) + return -1; + gfid = gfid_req; + } else { + gfid = &replies[gfid_idx].poststat.ia_gfid; + if (source == -1) + /* Either entry split-brain or dirty xattrs are present on parent.*/ + source = gfid_idx; + } + + is_gfid_absent = (gfid_idx == -1) ? _gf_true : _gf_false; + ret = __afr_selfheal_assign_gfid(this, parent, pargfid, bname, inode, + replies, gfid, locked_on, source, sources, + is_gfid_absent, &gfid_idx); + if (ret || (gfid_idx < 0)) + return ret; + + ret = __afr_selfheal_name_impunge(frame, this, parent, pargfid, bname, + inode, replies, gfid_idx); + if (ret == -EIO) + ret = -1; + + return ret; +} + +int +__afr_selfheal_name_finalize_source(xlator_t *this, unsigned char *sources, + unsigned char *healed_sinks, + unsigned char *locked_on, uint64_t *witness) +{ + int i = 0; + afr_private_t *priv = NULL; + int source = -1; + int sources_count = 0; + + priv = this->private; + + sources_count = AFR_COUNT(sources, priv->child_count); + + if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) || + !sources_count || afr_does_witness_exist(this, witness)) { + memset(sources, 0, sizeof(*sources) * priv->child_count); + afr_mark_active_sinks(this, sources, locked_on, healed_sinks); + return -1; + } + + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + source = i; + break; + } + } + + return source; +} + +int +__afr_selfheal_name_prepare(call_frame_t *frame, xlator_t *this, + inode_t *parent, uuid_t pargfid, + unsigned char *locked_on, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, + int *source_p) +{ + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + uint64_t *witness = NULL; + + priv = this->private; + + replies = alloca0(priv->child_count * sizeof(*replies)); + + ret = afr_selfheal_unlocked_discover(frame, parent, pargfid, replies); + if (ret) + goto out; + + witness = alloca0(sizeof(*witness) * priv->child_count); + ret = afr_selfheal_find_direction(frame, this, replies, + AFR_ENTRY_TRANSACTION, locked_on, sources, + sinks, witness, NULL); + if (ret) + goto out; + + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). + + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. + */ + AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count); + + source = __afr_selfheal_name_finalize_source(this, sources, healed_sinks, + locked_on, witness); + if (source < 0) { + /* If source is < 0 (typically split-brain), we perform a + conservative merge of entries rather than erroring out */ + } + *source_p = source; + +out: + if (replies) + afr_replies_wipe(replies, priv->child_count); + + return ret; +} + +int +afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, const char *bname, void *gfid_req, + dict_t *xdata) +{ + afr_private_t *priv = NULL; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *healed_sinks = NULL; + unsigned char *locked_on = NULL; + int source = -1; + struct afr_reply *replies = NULL; + int ret = -1; + inode_t *inode = NULL; + dict_t *xattr = NULL; + + xattr = dict_new(); + if (!xattr) + return -ENOMEM; + + ret = dict_set_int32_sizen(xattr, GF_GFIDLESS_LOOKUP, 1); + if (ret) { + dict_unref(xattr); + return -1; + } + + priv = this->private; + + locked_on = alloca0(priv->child_count); + sources = alloca0(priv->child_count); + sinks = alloca0(priv->child_count); + healed_sinks = alloca0(priv->child_count); + + replies = alloca0(priv->child_count * sizeof(*replies)); + + ret = afr_selfheal_entrylk(frame, this, parent, this->name, bname, + locked_on); + { + if (ret < priv->child_count) { + ret = -ENOTCONN; + goto unlock; + } + + ret = __afr_selfheal_name_prepare(frame, this, parent, pargfid, + locked_on, sources, sinks, + healed_sinks, &source); + if (ret) + goto unlock; + + inode = afr_selfheal_unlocked_lookup_on(frame, parent, bname, replies, + locked_on, xattr); + if (!inode) { + ret = -ENOMEM; + goto unlock; + } + + ret = __afr_selfheal_name_do(frame, this, parent, pargfid, bname, inode, + sources, sinks, healed_sinks, source, + locked_on, replies, gfid_req, xdata); + } +unlock: + afr_selfheal_unentrylk(frame, this, parent, this->name, bname, locked_on, + NULL); + if (inode) + inode_unref(inode); + + if (replies) + afr_replies_wipe(replies, priv->child_count); + if (xattr) + dict_unref(xattr); + + return ret; +} + +int +afr_selfheal_name_unlocked_inspect(call_frame_t *frame, xlator_t *this, + inode_t *parent, uuid_t pargfid, + const char *bname, gf_boolean_t *need_heal) +{ + afr_private_t *priv = NULL; + int i = 0; + struct afr_reply *replies = NULL; + inode_t *inode = NULL; + int first_idx = -1; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + replies = alloca0(sizeof(*replies) * priv->child_count); + + inode = afr_selfheal_unlocked_lookup_on(frame, parent, bname, replies, + local->child_up, NULL); + if (!inode) + return -ENOMEM; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if ((replies[i].op_ret == -1) && (replies[i].op_errno == ENODATA)) { + *need_heal = _gf_true; + break; + } + + if (first_idx == -1) { + first_idx = i; + continue; + } + + if (replies[i].op_ret != replies[first_idx].op_ret) { + *need_heal = _gf_true; + break; + } + + if (gf_uuid_compare(replies[i].poststat.ia_gfid, + replies[first_idx].poststat.ia_gfid)) { + *need_heal = _gf_true; + break; + } + } + + if (inode) + inode_unref(inode); + if (replies) + afr_replies_wipe(replies, priv->child_count); + return 0; +} + +int +afr_selfheal_name(xlator_t *this, uuid_t pargfid, const char *bname, + void *gfid_req, dict_t *xdata) +{ + inode_t *parent = NULL; + call_frame_t *frame = NULL; + int ret = -1; + gf_boolean_t need_heal = _gf_false; + + parent = afr_inode_find(this, pargfid); + if (!parent) + goto out; + + frame = afr_frame_create(this, NULL); + if (!frame) + goto out; + + ret = afr_selfheal_name_unlocked_inspect(frame, this, parent, pargfid, + bname, &need_heal); + if (ret) + goto out; + + if (need_heal) { + ret = afr_selfheal_name_do(frame, this, parent, pargfid, bname, + gfid_req, xdata); + if (ret) + goto out; + } + + ret = 0; +out: + if (parent) + inode_unref(parent); + if (frame) + AFR_STACK_DESTROY(frame); + + return ret; +} diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 7c9bc81119c..48e6dbcfb18 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -8,36 +8,370 @@ cases as published by the Free Software Foundation. */ -#ifndef __AFR_SELF_HEAL_H__ -#define __AFR_SELF_HEAL_H__ +#ifndef _AFR_SELFHEAL_H +#define _AFR_SELFHEAL_H -#include <sys/stat.h> +/* Perform fop on all UP subvolumes and wait for all callbacks to return */ -#define FILETYPE_DIFFERS(buf1,buf2) ((buf1)->ia_type != (buf2)->ia_type) -#define PERMISSION_DIFFERS(buf1,buf2) (st_mode_from_ia ((buf1)->ia_prot, (buf1)->ia_type) != st_mode_from_ia ((buf2)->ia_prot, (buf2)->ia_type)) -#define OWNERSHIP_DIFFERS(buf1,buf2) (((buf1)->ia_uid != (buf2)->ia_uid) || ((buf1)->ia_gid != (buf2)->ia_gid)) -#define SIZE_DIFFERS(buf1,buf2) ((buf1)->ia_size != (buf2)->ia_size) +#define AFR_ONALL(frame, rfn, fop, args...) \ + do { \ + afr_local_t *__local = frame->local; \ + afr_private_t *__priv = frame->this->private; \ + int __i = 0, __count = 0; \ + unsigned char *__child_up = alloca(__priv->child_count); \ + \ + memcpy(__child_up, __priv->child_up, \ + sizeof(*__child_up) * __priv->child_count); \ + __count = AFR_COUNT(__child_up, __priv->child_count); \ + \ + __local->barrier.waitfor = __count; \ + afr_local_replies_wipe(__local, __priv); \ + \ + for (__i = 0; __i < __priv->child_count; __i++) { \ + if (!__child_up[__i]) \ + continue; \ + STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i, \ + __priv->children[__i], \ + __priv->children[__i]->fops->fop, args); \ + } \ + syncbarrier_wait(&__local->barrier, __count); \ + } while (0) -#define SIZE_GREATER(buf1,buf2) ((buf1)->ia_size > (buf2)->ia_size) +/* Perform fop on all subvolumes represented by list[] array and wait + for all callbacks to return */ + +#define AFR_ONLIST(list, frame, rfn, fop, args...) \ + do { \ + afr_local_t *__local = frame->local; \ + afr_private_t *__priv = frame->this->private; \ + int __i = 0; \ + int __count = 0; \ + unsigned char *__list = alloca(__priv->child_count); \ + \ + memcpy(__list, list, sizeof(*__list) * __priv->child_count); \ + __count = AFR_COUNT(__list, __priv->child_count); \ + __local->barrier.waitfor = __count; \ + afr_local_replies_wipe(__local, __priv); \ + \ + for (__i = 0; __i < __priv->child_count; __i++) { \ + if (!__list[__i]) \ + continue; \ + STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i, \ + __priv->children[__i], \ + __priv->children[__i]->fops->fop, args); \ + } \ + syncbarrier_wait(&__local->barrier, __count); \ + } while (0) + +#define AFR_SEQ(frame, rfn, fop, args...) \ + do { \ + afr_local_t *__local = frame->local; \ + afr_private_t *__priv = frame->this->private; \ + int __i = 0; \ + \ + afr_local_replies_wipe(__local, __priv); \ + \ + for (__i = 0; __i < __priv->child_count; __i++) { \ + if (!__priv->child_up[__i]) \ + continue; \ + STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i, \ + __priv->children[__i], \ + __priv->children[__i]->fops->fop, args); \ + syncbarrier_wait(&__local->barrier, 1); \ + } \ + } while (0) + +#define ALLOC_MATRIX(n, type) \ + ({ \ + int __i; \ + type **__ptr = alloca(n * sizeof(type *)); \ + \ + for (__i = 0; __i < n; __i++) \ + __ptr[__i] = alloca0(n * sizeof(type)); \ + __ptr; \ + }) + +#define IA_EQUAL(f, s, field) \ + (memcmp(&(f.ia_##field), &(s.ia_##field), sizeof(s.ia_##field)) == 0) + +#define SBRAIN_HEAL_NO_GO_MSG \ + "Failed to obtain replies from all bricks of " \ + "the replica (are they up?). Cannot resolve split-brain." +#define SFILE_NOT_IN_SPLIT_BRAIN "File not in split-brain" +#define SNO_BIGGER_FILE "No bigger file" +#define SNO_DIFF_IN_MTIME "No difference in mtime" +#define SUSE_SOURCE_BRICK_TO_HEAL \ + "Use source-brick option to heal metadata" \ + " split-brain" +#define SINVALID_BRICK_NAME "Invalid brick name" +#define SBRICK_IS_NOT_UP "Brick is not up" +#define SBRICK_NOT_CONNECTED "Brick is not connected" +#define SLESS_THAN2_BRICKS_in_REP "< 2 bricks in replica are up" +#define SBRICK_IS_REMOTE "Brick is remote" +#define SSTARTED_SELF_HEAL "Started self-heal" +#define SOP_NOT_SUPPORTED "Operation Not Supported" +#define SFILE_NOT_UNDER_DATA \ + "The file is not under data or metadata " \ + "split-brain" +#define SFILE_NOT_IN_SPLIT_BRAIN "File not in split-brain" +#define SALL_BRICKS_UP_TO_RESOLVE \ + "All the bricks should be up to resolve the" \ + " gfid split brain" +#define SERROR_GETTING_SRC_BRICK "Error getting the source brick" +int +afr_selfheal(xlator_t *this, uuid_t gfid); + +gf_boolean_t +afr_throttled_selfheal(call_frame_t *frame, xlator_t *this); + +int +afr_selfheal_name(xlator_t *this, uuid_t gfid, const char *name, void *gfid_req, + dict_t *xdata); + +int +afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd); + +int +afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode); + +int +afr_selfheal_entry(call_frame_t *frame, xlator_t *this, inode_t *inode); + +int +afr_lookup_and_heal_gfid(xlator_t *this, inode_t *parent, const char *name, + inode_t *inode, struct afr_reply *replies, int source, + unsigned char *sources, void *gfid, int *gfid_idx); + +int +afr_selfheal_inodelk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on); + +int +afr_selfheal_tryinodelk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on); + +int +afr_selfheal_tie_breaker_inodelk(call_frame_t *frame, xlator_t *this, + inode_t *inode, char *dom, off_t off, + size_t size, unsigned char *locked_on); + +int +afr_selfheal_uninodelk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + const unsigned char *locked_on); + +int +afr_selfheal_entrylk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on); + +int +afr_selfheal_tryentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on); + +int +afr_selfheal_tie_breaker_entrylk(call_frame_t *frame, xlator_t *this, + inode_t *inode, char *dom, const char *name, + unsigned char *locked_on); + +int +afr_selfheal_unentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on, + dict_t *xdata); + +int +afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid, + struct afr_reply *replies); + +int +afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode, + uuid_t gfid, struct afr_reply *replies, + unsigned char *discover_on, dict_t *dict); +inode_t * +afr_selfheal_unlocked_lookup_on(call_frame_t *frame, inode_t *parent, + const char *name, struct afr_reply *replies, + unsigned char *lookup_on, dict_t *xattr); + +int +afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, + afr_transaction_type type, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks, + uint64_t *witness, unsigned char *flag); +int +afr_selfheal_fill_matrix(xlator_t *this, int **matrix, int subvol, int idx, + dict_t *xdata); + +int +afr_selfheal_extract_xattr(xlator_t *this, struct afr_reply *replies, + afr_transaction_type type, int *dirty, int **matrix); + +int +afr_sh_generic_fop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata); + +int +afr_selfheal_restore_time(call_frame_t *frame, xlator_t *this, inode_t *inode, + int source, unsigned char *healed_sinks, + struct afr_reply *replies); +int +afr_selfheal_undo_pending(call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *undid_pending, + afr_transaction_type type, struct afr_reply *replies, + unsigned char *locked_on); + +int +afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source, + unsigned char *sources, inode_t *dir, + const char *name, inode_t *inode, + struct afr_reply *replies); + +int +afr_selfheal_post_op(call_frame_t *frame, xlator_t *this, inode_t *inode, + int subvol, dict_t *xattr, dict_t *xdata); + +call_frame_t * +afr_frame_create(xlator_t *this, int32_t *op_errno); + +inode_t * +afr_inode_find(xlator_t *this, uuid_t gfid); + +int +afr_selfheal_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *parbuf); +void +afr_reply_copy(struct afr_reply *dst, struct afr_reply *src); + +void +afr_replies_copy(struct afr_reply *dst, struct afr_reply *src, int count); int -afr_self_heal_entry (call_frame_t *frame, xlator_t *this); +afr_selfheal_newentry_mark(call_frame_t *frame, xlator_t *this, inode_t *inode, + int source, struct afr_reply *replies, + unsigned char *sources, unsigned char *newentry); + +unsigned int +afr_success_count(struct afr_reply *replies, unsigned int count); + +void +afr_log_selfheal(uuid_t gfid, xlator_t *this, int ret, char *type, int source, + unsigned char *sources, unsigned char *healed_sinks); + +void +afr_mark_largest_file_as_source(xlator_t *this, unsigned char *sources, + struct afr_reply *replies); +void +afr_mark_active_sinks(xlator_t *this, unsigned char *sources, + unsigned char *locked_on, unsigned char *sinks); + +gf_boolean_t +afr_dict_contains_heal_op(call_frame_t *frame); +gf_boolean_t +afr_can_decide_split_brain_source_sinks(struct afr_reply *replies, + int child_count); int -afr_self_heal_data (call_frame_t *frame, xlator_t *this); +afr_mark_split_brain_source_sinks( + call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on, + struct afr_reply *replies, afr_transaction_type type); int -afr_self_heal_metadata (call_frame_t *frame, xlator_t *this); +afr_sh_get_fav_by_policy(xlator_t *this, struct afr_reply *replies, + inode_t *inode, char **policy_str); int -afr_self_heal_get_source (xlator_t *this, afr_local_t *local, dict_t **xattr); +_afr_fav_child_reset_sink_xattrs(call_frame_t *frame, xlator_t *this, + inode_t *inode, int source, + unsigned char *healed_sinks, + unsigned char *undid_pending, + afr_transaction_type type, + unsigned char *locked_on, + struct afr_reply *replies); int -afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode); +afr_get_child_index_from_name(xlator_t *this, char *name); +gf_boolean_t +afr_does_witness_exist(xlator_t *this, uint64_t *witness); + +int +__afr_selfheal_data_prepare(call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *locked_on, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, + unsigned char *undid_pending, + struct afr_reply *replies, unsigned char *flag); + +int +__afr_selfheal_metadata_prepare(call_frame_t *frame, xlator_t *this, + inode_t *inode, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *undid_pending, + struct afr_reply *replies, unsigned char *flag); +int +__afr_selfheal_entry_prepare(call_frame_t *frame, xlator_t *this, + inode_t *inode, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, + struct afr_reply *replies, int *source_p, + unsigned char *flag); + +int +afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid, + inode_t **link_inode, gf_boolean_t *data_selfheal, + gf_boolean_t *metadata_selfheal, + gf_boolean_t *entry_selfheal, + struct afr_reply *replies); + +int +afr_selfheal_do(call_frame_t *frame, xlator_t *this, uuid_t gfid); + +int +afr_selfheal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata); + +int +afr_locked_fill(call_frame_t *frame, xlator_t *this, unsigned char *locked_on); +int +afr_choose_source_by_policy(afr_private_t *priv, unsigned char *sources, + afr_transaction_type type); + +int +afr_selfheal_metadata_by_stbuf(xlator_t *this, struct iatt *stbuf); + +int +afr_sh_fav_by_size(xlator_t *this, struct afr_reply *replies, inode_t *inode); +int +afr_sh_fav_by_mtime(xlator_t *this, struct afr_reply *replies, inode_t *inode); +int +afr_sh_fav_by_ctime(xlator_t *this, struct afr_reply *replies, inode_t *inode); + +int +afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies, + inode_t *inode, uuid_t pargfid, const char *bname, + int src_idx, int child_idx, + unsigned char *locked_on, int *src, dict_t *xdata); +int +afr_mark_source_sinks_if_file_empty(xlator_t *this, unsigned char *sources, + unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *locked_on, + struct afr_reply *replies, + afr_transaction_type type); + +gf_boolean_t +afr_is_file_empty_on_all_children(afr_private_t *priv, + struct afr_reply *replies); + +int +afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name, + inode_t *inode, int child, struct afr_reply *replies); int -afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, - dict_t **xattr, - afr_transaction_type txn_type, - uuid_t gfid); -#endif /* __AFR_SELF_HEAL_H__ */ +afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode); +#endif /* !_AFR_SELFHEAL_H */ diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c index f33b04eed06..109fd4b7421 100644 --- a/xlators/cluster/afr/src/afr-self-heald.c +++ b/xlators/cluster/afr/src/afr-self-heald.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -8,1266 +8,1709 @@ cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif #include "afr.h" -#include "syncop.h" +#include "afr-self-heal.h" #include "afr-self-heald.h" -#include "afr-self-heal-common.h" #include "protocol-common.h" -#include "event-history.h" - -typedef enum { - STOP_CRAWL_ON_SINGLE_SUBVOL = 1 -} afr_crawl_flags_t; - -typedef enum { - HEAL = 1, - INFO -} shd_crawl_op; - -typedef struct shd_dump { - dict_t *dict; - xlator_t *this; - int child; -} shd_dump_t; - -typedef struct shd_event_ { - int child; - char *path; -} shd_event_t; +#include <glusterfs/syncop-utils.h> +#include "afr-messages.h" +#include <glusterfs/byte-order.h> + +#define AFR_EH_SPLIT_BRAIN_LIMIT 1024 +#define AFR_STATISTICS_HISTORY_SIZE 50 + +#define ASSERT_LOCAL(this, healer) \ + if (!afr_shd_is_subvol_local(this, healer->subvol)) { \ + healer->local = _gf_false; \ + if (safe_break(healer)) { \ + break; \ + } else { \ + continue; \ + } \ + } else { \ + healer->local = _gf_true; \ + } + +#define NTH_INDEX_HEALER(this, n) \ + &((((afr_private_t *)this->private))->shd.index_healers[n]) +#define NTH_FULL_HEALER(this, n) \ + &((((afr_private_t *)this->private))->shd.full_healers[n]) + +char * +afr_subvol_name(xlator_t *this, int subvol) +{ + afr_private_t *priv = NULL; -typedef struct shd_pos_ { - int child; - xlator_t *this; - afr_child_pos_t pos; -} shd_pos_t; + priv = this->private; + if (subvol < 0 || subvol > priv->child_count) + return NULL; -typedef int -(*afr_crawl_done_cbk_t) (int ret, call_frame_t *sync_frame, void *crawl_data); + return priv->children[subvol]->name; +} void -afr_start_crawl (xlator_t *this, int idx, afr_crawl_type_t crawl, - process_entry_cbk_t process_entry, void *op_data, - gf_boolean_t exclusive, int crawl_flags, - afr_crawl_done_cbk_t crawl_done); - -static int -_crawl_directory (fd_t *fd, loc_t *loc, afr_crawl_data_t *crawl_data); - -/* For calling straight through (e.g. already in a synctask). */ -int -afr_find_child_position (xlator_t *this, int child, afr_child_pos_t *pos); - -/* For deferring through a new synctask. */ -int -afr_syncop_find_child_position (void *data); - -static int -_loc_assign_gfid_path (loc_t *loc) +afr_destroy_crawl_event_data(void *data) { - int ret = -1; - char gfid_path[64] = {0}; - - if (loc->inode && !uuid_is_null (loc->inode->gfid)) { - ret = inode_path (loc->inode, NULL, (char**)&loc->path); - } else if (!uuid_is_null (loc->gfid)) { - snprintf (gfid_path, sizeof (gfid_path), "<gfid:%s>", - uuid_utoa (loc->gfid)); - loc->path = gf_strdup (gfid_path); - if (loc->path) - ret = 0; - } - return ret; + return; } void -shd_cleanup_event (void *event) +afr_destroy_shd_event_data(void *data) { - shd_event_t *shd_event = event; + shd_event_t *shd_event = data; - if (!shd_event) - goto out; - GF_FREE (shd_event->path); - GF_FREE (shd_event); -out: + if (!shd_event) return; + GF_FREE(shd_event->path); + + return; } -int -afr_get_local_child (afr_self_heald_t *shd, unsigned int child_count) +gf_boolean_t +afr_shd_is_subvol_local(xlator_t *this, int subvol) { - int i = 0; - int ret = -1; - for (i = 0; i < child_count; i++) { - if (shd->pos[i] == AFR_POS_LOCAL) { - ret = i; - break; - } - } - return ret; + afr_private_t *priv = NULL; + gf_boolean_t is_local = _gf_false; + loc_t loc = { + 0, + }; + + loc.inode = this->itable->root; + gf_uuid_copy(loc.gfid, loc.inode->gfid); + priv = this->private; + syncop_is_subvol_local(priv->children[subvol], &loc, &is_local); + return is_local; } -static int -_build_index_loc (xlator_t *this, loc_t *loc, char *name, loc_t *parent) +int +__afr_shd_healer_wait(struct subvol_healer *healer) { - int ret = 0; - - uuid_copy (loc->pargfid, parent->inode->gfid); - loc->path = ""; - loc->name = name; - loc->parent = inode_ref (parent->inode); - if (!loc->parent) { - loc->path = NULL; - loc_wipe (loc); - ret = -1; - } - return ret; + afr_private_t *priv = NULL; + struct timespec wait_till = { + 0, + }; + int ret = 0; + + priv = healer->this->private; + +disabled_loop: + wait_till.tv_sec = gf_time() + priv->shd.timeout; + + while (!healer->rerun) { + ret = pthread_cond_timedwait(&healer->cond, &healer->mutex, &wait_till); + if (ret == ETIMEDOUT) + break; + } + + ret = healer->rerun; + healer->rerun = 0; + + if (!priv->shd.enabled) + goto disabled_loop; + + return ret; } int -_add_path_to_dict (xlator_t *this, dict_t *output, int child, char *path, - struct timeval *tv, gf_boolean_t dyn) +afr_shd_healer_wait(struct subvol_healer *healer) { - //subkey not used for now - int ret = -1; - uint64_t count = 0; - char key[256] = {0}; - int xl_id = 0; + int ret = 0; - ret = dict_get_int32 (output, this->name, &xl_id); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "xl does not have id"); - goto out; - } + pthread_mutex_lock(&healer->mutex); + { + ret = __afr_shd_healer_wait(healer); + } + pthread_mutex_unlock(&healer->mutex); - snprintf (key, sizeof (key), "%d-%d-count", xl_id, child); - ret = dict_get_uint64 (output, key, &count); + return ret; +} - snprintf (key, sizeof (key), "%d-%d-%"PRIu64, xl_id, child, count); - if (dyn) - ret = dict_set_dynstr (output, key, path); - else - ret = dict_set_str (output, key, path); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "%s: Could not add to output", - path); - goto out; - } +gf_boolean_t +safe_break(struct subvol_healer *healer) +{ + gf_boolean_t ret = _gf_false; - if (!tv) - goto inc_count; - snprintf (key, sizeof (key), "%d-%d-%"PRIu64"-time", xl_id, - child, count); - ret = dict_set_uint32 (output, key, tv->tv_sec); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "%s: Could not set time", - path); - goto out; - } + pthread_mutex_lock(&healer->mutex); + { + if (healer->rerun) + goto unlock; -inc_count: - snprintf (key, sizeof (key), "%d-%d-count", xl_id, child); - ret = dict_set_uint64 (output, key, count + 1); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Could not increment count"); - goto out; - } - ret = 0; -out: - return ret; + healer->running = _gf_false; + ret = _gf_true; + } +unlock: + pthread_mutex_unlock(&healer->mutex); + + return ret; } -int -_get_path_from_gfid_loc (xlator_t *this, xlator_t *readdir_xl, loc_t *child, - char **fpath, gf_boolean_t *missing) +inode_t * +afr_shd_inode_find(xlator_t *this, xlator_t *subvol, uuid_t gfid) { - dict_t *xattr = NULL; - char *path = NULL; - int ret = -1; - - ret = syncop_getxattr (readdir_xl, child, &xattr, GFID_TO_PATH_KEY); - if (ret < 0) { - if ((errno == ENOENT) && missing) - *missing = _gf_true; - goto out; - } - ret = dict_get_str (xattr, GFID_TO_PATH_KEY, &path); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Failed to get path for " - "gfid %s", uuid_utoa (child->gfid)); - goto out; - } - path = gf_strdup (path); - if (!path) { - ret = -1; - goto out; - } - ret = 0; + int ret = 0; + uint64_t val = IA_INVAL; + dict_t *xdata = NULL; + dict_t *rsp_dict = NULL; + inode_t *inode = NULL; + + xdata = dict_new(); + if (!xdata) + goto out; + + ret = dict_set_int8(xdata, GF_INDEX_IA_TYPE_GET_REQ, 1); + if (ret) + goto out; + + ret = syncop_inode_find(this, subvol, gfid, &inode, xdata, &rsp_dict); + if (ret < 0) + goto out; + + if (rsp_dict) { + ret = dict_get_uint64(rsp_dict, GF_INDEX_IA_TYPE_GET_RSP, &val); + if (ret) + goto out; + } + ret = inode_ctx_set2(inode, subvol, 0, &val); out: - if (!ret) - *fpath = path; - if (xattr) - dict_unref (xattr); - return ret; + if (ret && inode) { + inode_unref(inode); + inode = NULL; + } + if (xdata) + dict_unref(xdata); + if (rsp_dict) + dict_unref(rsp_dict); + return inode; } -int -_add_event_to_dict (circular_buffer_t *cb, void *data) +inode_t * +afr_shd_index_inode(xlator_t *this, xlator_t *subvol, char *vgfid) { - int ret = 0; - shd_dump_t *dump_data = NULL; - shd_event_t *shd_event = NULL; + loc_t rootloc = { + 0, + }; + inode_t *inode = NULL; + int ret = 0; + dict_t *xattr = NULL; + void *index_gfid = NULL; + + rootloc.inode = inode_ref(this->itable->root); + gf_uuid_copy(rootloc.gfid, rootloc.inode->gfid); + + ret = syncop_getxattr(subvol, &rootloc, &xattr, vgfid, NULL, NULL); + if (ret || !xattr) { + errno = -ret; + goto out; + } + + ret = dict_get_ptr(xattr, vgfid, &index_gfid); + if (ret) + goto out; + + gf_msg_debug(this->name, 0, "%s dir gfid for %s: %s", vgfid, subvol->name, + uuid_utoa(index_gfid)); + + inode = afr_shd_inode_find(this, subvol, index_gfid); - dump_data = data; - shd_event = cb->data; - if (shd_event->child != dump_data->child) - goto out; - ret = _add_path_to_dict (dump_data->this, dump_data->dict, - dump_data->child, shd_event->path, &cb->tv, - _gf_false); out: - return ret; + loc_wipe(&rootloc); + + if (xattr) + dict_unref(xattr); + + return inode; } int -_add_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict, int child) +afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name, + ia_type_t type) { - shd_dump_t dump_data = {0}; + int ret = 0; + loc_t loc = { + 0, + }; - dump_data.this = this; - dump_data.dict = dict; - dump_data.child = child; - eh_dump (eh, &dump_data, _add_event_to_dict); - return 0; + loc.parent = inode_ref(inode); + loc.name = name; + + if (IA_ISDIR(type)) + ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL); + else + ret = syncop_unlink(subvol, &loc, NULL, NULL); + + loc_wipe(&loc); + return ret; } void -_remove_stale_index (xlator_t *this, xlator_t *readdir_xl, - loc_t *parent, char *fname) +afr_shd_zero_xattrop(xlator_t *this, uuid_t gfid) { - int ret = 0; - loc_t index_loc = {0}; - - ret = _build_index_loc (this, &index_loc, fname, parent); + call_frame_t *frame = NULL; + inode_t *inode = NULL; + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + int ret = 0; + int i = 0; + int raw[AFR_NUM_CHANGE_LOGS] = {0}; + + priv = this->private; + frame = afr_frame_create(this, NULL); + if (!frame) + goto out; + inode = afr_inode_find(this, gfid); + if (!inode) + goto out; + xattr = dict_new(); + if (!xattr) + goto out; + ret = dict_set_static_bin(xattr, AFR_DIRTY, raw, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) + goto out; + for (i = 0; i < priv->child_count; i++) { + ret = dict_set_static_bin(xattr, priv->pending_key[i], raw, + sizeof(int) * AFR_NUM_CHANGE_LOGS); if (ret) - goto out; - gf_log (this->name, GF_LOG_DEBUG, "Removing stale index " - "for %s on %s", index_loc.name, readdir_xl->name); - ret = syncop_unlink (readdir_xl, &index_loc); - if(ret && (errno != ENOENT)) { - gf_log(this->name, GF_LOG_ERROR, "%s: Failed to remove index " - "on %s - %s",index_loc.name, readdir_xl->name, - strerror (errno)); - } - index_loc.path = NULL; - loc_wipe (&index_loc); + goto out; + } + + /*Send xattrop to all bricks. Doing a lookup to see if bricks are up or + * has valid repies for this gfid seems a bit of an overkill.*/ + for (i = 0; i < priv->child_count; i++) + afr_selfheal_post_op(frame, this, inode, i, xattr, NULL); + out: - return; + if (frame) + AFR_STACK_DESTROY(frame); + if (inode) + inode_unref(inode); + if (xattr) + dict_unref(xattr); + return; } int -_add_summary_to_dict (xlator_t *this, afr_crawl_data_t *crawl_data, - gf_dirent_t *entry, - loc_t *childloc, loc_t *parentloc, struct iatt *iattr) +afr_shd_selfheal_name(struct subvol_healer *healer, int child, uuid_t parent, + const char *bname) { - dict_t *output = NULL; - xlator_t *readdir_xl = NULL; - int ret = -1; - char *path = NULL; - gf_boolean_t missing = _gf_false; - char gfid_str[64] = {0}; - - if (uuid_is_null (childloc->gfid)) - goto out; + int ret = -1; - output = crawl_data->op_data; - readdir_xl = crawl_data->readdir_xl; + ret = afr_selfheal_name(THIS, parent, bname, NULL, NULL); - ret = _get_path_from_gfid_loc (this, readdir_xl, childloc, &path, - &missing); - if (ret == 0) { - ret = _add_path_to_dict (this, output, crawl_data->child, path, - NULL, _gf_true); - } else if (missing) { - _remove_stale_index (this, readdir_xl, parentloc, - uuid_utoa_r (childloc->gfid, gfid_str)); + return ret; +} + +int +afr_shd_selfheal(struct subvol_healer *healer, int child, uuid_t gfid) +{ + int ret = 0; + eh_t *eh = NULL; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + shd_event_t *shd_event = NULL; + char *path = NULL; + xlator_t *subvol = NULL; + xlator_t *this = NULL; + crawl_event_t *crawl_event = NULL; + + this = healer->this; + priv = this->private; + shd = &priv->shd; + crawl_event = &healer->crawl_event; + + subvol = priv->children[child]; + + // If this fails with ENOENT/ESTALE index is stale + ret = syncop_gfid_to_path(this->itable, subvol, gfid, &path); + if (ret < 0) + return ret; + + ret = afr_selfheal(this, gfid); + + LOCK(&priv->lock); + { + if (ret == -EIO) { + eh = shd->split_brain; + crawl_event->split_brain_count++; + } else if (ret < 0) { + crawl_event->heal_failed_count++; + } else if (ret == 0) { + crawl_event->healed_count++; } + } + UNLOCK(&priv->lock); + + if (eh) { + shd_event = GF_CALLOC(1, sizeof(*shd_event), gf_afr_mt_shd_event_t); + if (!shd_event) + goto out; + shd_event->child = child; + shd_event->path = path; + + if (eh_save_history(eh, shd_event) < 0) + goto out; + + shd_event = NULL; + path = NULL; + } out: - if (ret && path) - GF_FREE (path); - return ret; + GF_FREE(shd_event); + GF_FREE(path); + return ret; } void -_crawl_post_sh_action (xlator_t *this, loc_t *parent, loc_t *child, - int32_t op_ret, int32_t op_errno, dict_t *xattr_rsp, - afr_crawl_data_t *crawl_data) +afr_shd_sweep_prepare(struct subvol_healer *healer) { - int ret = 0; - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - eh_t *eh = NULL; - char *path = NULL; - char gfid_str[64] = {0}; - shd_event_t *event = NULL; - int32_t sh_failed = 0; - gf_boolean_t split_brain = 0; - int32_t actual_sh_done = 0; - priv = this->private; - shd = &priv->shd; - if (crawl_data->crawl == INDEX) { - if ((op_ret < 0) && (op_errno == ENOENT)) { - _remove_stale_index (this, crawl_data->readdir_xl, - parent, uuid_utoa_r (child->gfid, - gfid_str)); - goto out; - } - ret = _get_path_from_gfid_loc (this, crawl_data->readdir_xl, - child, &path, NULL); - if (ret) - goto out; - } else { - path = gf_strdup (child->path); - if (!path) { - ret = -1; - goto out; - } - } - - if (xattr_rsp) { - ret = dict_get_int32 (xattr_rsp, "sh-failed", &sh_failed); - ret = dict_get_int32 (xattr_rsp, "actual-sh-done", &actual_sh_done); - } + crawl_event_t *event = NULL; - split_brain = afr_is_split_brain (this, child->inode); + event = &healer->crawl_event; - if ((op_ret < 0 && op_errno == EIO) || split_brain) { - eh = shd->split_brain; - } else if ((op_ret < 0) || sh_failed) { - eh = shd->heal_failed; - } else if (actual_sh_done == 1) { - eh = shd->healed; - } + event->healed_count = 0; + event->split_brain_count = 0; + event->heal_failed_count = 0; - ret = -1; + event->start_time = gf_time(); + event->end_time = 0; + _mask_cancellation(); +} - if (eh != NULL) { - event = GF_CALLOC (1, sizeof (*event), gf_afr_mt_shd_event_t); - if (!event) - goto out; - event->child = crawl_data->child; - event->path = path; +void +afr_shd_sweep_done(struct subvol_healer *healer) +{ + crawl_event_t *event = NULL; + crawl_event_t *history = NULL; + afr_self_heald_t *shd = NULL; - ret = eh_save_history (eh, event); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "%s:Failed to save " - "to event history, (%d, %s)", path, op_ret, - strerror (op_errno)); + event = &healer->crawl_event; + shd = &(((afr_private_t *)healer->this->private)->shd); - goto out; - } - } else { - gf_log (this->name, GF_LOG_DEBUG, "%s:Self heal already done ", - path); + event->end_time = gf_time(); + history = gf_memdup(event, sizeof(*event)); + event->start_time = 0; - } - ret = 0; -out: - if (ret && path) - GF_FREE (path); + if (!history) return; + + if (eh_save_history(shd->statistics[healer->subvol], history) < 0) + GF_FREE(history); + _unmask_cancellation(); } int -_link_inode_update_loc (xlator_t *this, loc_t *loc, struct iatt *iattr) +afr_shd_index_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, + void *data) { - inode_t *link_inode = NULL; - int ret = -1; + struct subvol_healer *healer = data; + afr_private_t *priv = NULL; + uuid_t gfid = {0}; + int ret = 0; + uint64_t val = IA_INVAL; - link_inode = inode_link (loc->inode, NULL, NULL, iattr); - if (link_inode == NULL) { - gf_log (this->name, GF_LOG_ERROR, "inode link failed " - "on the inode (%s)", uuid_utoa (iattr->ia_gfid)); - goto out; - } - inode_unref (loc->inode); - loc->inode = link_inode; - ret = 0; -out: - return ret; -} + priv = healer->this->private; + if (!priv->shd.enabled) + return -EBUSY; -int -_self_heal_entry (xlator_t *this, afr_crawl_data_t *crawl_data, gf_dirent_t *entry, - loc_t *child, loc_t *parent, struct iatt *iattr) -{ - struct iatt parentbuf = {0}; - int ret = 0; - dict_t *xattr_rsp = NULL; + gf_msg_debug(healer->this->name, 0, "got entry: %s from %s", entry->d_name, + priv->children[healer->subvol]->name); - gf_log (this->name, GF_LOG_DEBUG, "lookup %s", child->path); + ret = gf_uuid_parse(entry->d_name, gfid); + if (ret) + return 0; - ret = syncop_lookup (this, child, NULL, - iattr, &xattr_rsp, &parentbuf); - _crawl_post_sh_action (this, parent, child, ret, errno, xattr_rsp, - crawl_data); - if (xattr_rsp) - dict_unref (xattr_rsp); - if (ret == 0) - ret = _link_inode_update_loc (this, child, iattr); + inode_ctx_get2(parent->inode, subvol, NULL, &val); - return ret; -} + ret = afr_shd_selfheal(healer, healer->subvol, gfid); -static int -afr_crawl_done (int ret, call_frame_t *sync_frame, void *data) -{ - GF_FREE (data); - STACK_DESTROY (sync_frame->root); - return 0; -} + if (ret == -ENOENT || ret == -ESTALE) + afr_shd_entry_purge(subvol, parent->inode, entry->d_name, val); -void -_do_self_heal_on_subvol (xlator_t *this, int child, afr_crawl_type_t crawl) -{ - afr_start_crawl (this, child, crawl, _self_heal_entry, - NULL, _gf_true, STOP_CRAWL_ON_SINGLE_SUBVOL, - afr_crawl_done); + if (ret == 2) + /* If bricks crashed in pre-op after creating indices/xattrop + * link but before setting afr changelogs, we end up with stale + * xattrop links but zero changelogs. Remove such entries by + * sending a post-op with zero changelogs. + */ + afr_shd_zero_xattrop(healer->this, gfid); + + return 0; } -gf_boolean_t -_crawl_proceed (xlator_t *this, int child, int crawl_flags, char **reason) +int +afr_shd_index_sweep(struct subvol_healer *healer, char *vgfid) { - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - gf_boolean_t proceed = _gf_false; - char *msg = NULL; - - priv = this->private; - shd = &priv->shd; - if (!shd->enabled) { - msg = "Self-heal daemon is not enabled"; - gf_log (this->name, GF_LOG_DEBUG, "%s", msg); - goto out; - } - if (!priv->child_up[child]) { - gf_log (this->name, GF_LOG_DEBUG, "Stopping crawl for %s , " - "subvol went down", priv->children[child]->name); - msg = "Brick is Not connected"; - goto out; - } + loc_t loc = {0}; + afr_private_t *priv = NULL; + int ret = 0; + xlator_t *subvol = NULL; + dict_t *xdata = NULL; + call_frame_t *frame = NULL; + + priv = healer->this->private; + subvol = priv->children[healer->subvol]; + + frame = afr_frame_create(healer->this, &ret); + if (!frame) { + ret = -ret; + goto out; + } + + loc.inode = afr_shd_index_inode(healer->this, subvol, vgfid); + if (!loc.inode) { + gf_msg(healer->this->name, GF_LOG_WARNING, 0, + AFR_MSG_INDEX_DIR_GET_FAILED, "unable to get index-dir on %s", + subvol->name); + ret = -errno; + goto out; + } + + xdata = dict_new(); + if (!xdata || dict_set_int32_sizen(xdata, "get-gfid-type", 1)) { + ret = -ENOMEM; + goto out; + } + + ret = syncop_mt_dir_scan(frame, subvol, &loc, GF_CLIENT_PID_SELF_HEALD, + healer, afr_shd_index_heal, xdata, + priv->shd.max_threads, priv->shd.wait_qlength); + + if (ret == 0) + ret = healer->crawl_event.healed_count; - if (crawl_flags & STOP_CRAWL_ON_SINGLE_SUBVOL) { - if (afr_up_children_count (priv->child_up, - priv->child_count) < 2) { - gf_log (this->name, GF_LOG_DEBUG, "Stopping crawl as " - "< 2 children are up"); - msg = "< 2 bricks in replica are running"; - goto out; - } - } - proceed = _gf_true; out: - if (reason) - *reason = msg; - return proceed; + loc_wipe(&loc); + + if (xdata) + dict_unref(xdata); + if (frame) + AFR_STACK_DESTROY(frame); + return ret; } int -_do_crawl_op_on_local_subvols (xlator_t *this, afr_crawl_type_t crawl, - shd_crawl_op op, dict_t *output) +afr_shd_index_sweep_all(struct subvol_healer *healer) { - afr_private_t *priv = NULL; - char *status = NULL; - char *subkey = NULL; - char key[256] = {0}; - shd_pos_t pos_data = {0}; - int op_ret = -1; - int xl_id = -1; - int i = 0; - int ret = 0; - int crawl_flags = 0; - - priv = this->private; - if (op == HEAL) - crawl_flags |= STOP_CRAWL_ON_SINGLE_SUBVOL; - - if (output) { - ret = dict_get_int32 (output, this->name, &xl_id); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Invalid input, " - "translator-id is not available"); - goto out; - } - } - pos_data.this = this; - subkey = "status"; - for (i = 0; i < priv->child_count; i++) { - if (_crawl_proceed (this, i, crawl_flags, &status)) { - pos_data.child = i; - /* - * We're already in a synctask in this case, so we - * don't need to defer through a second (and in fact - * that can cause deadlock). Just call straight - * through instead. - */ - ret = afr_find_child_position(pos_data.this, - pos_data.child, - &pos_data.pos); - if (ret) { - status = "Not able to find brick location"; - } else if (pos_data.pos == AFR_POS_REMOTE) { - status = "brick is remote"; - } else { - op_ret = 0; - if (op == HEAL) { - status = "Started self-heal"; - _do_self_heal_on_subvol (this, i, - crawl); - } else if (output) { - status = ""; - afr_start_crawl (this, i, INDEX, - _add_summary_to_dict, - output, _gf_false, 0, - NULL); - } - } - if (output) { - snprintf (key, sizeof (key), "%d-%d-%s", xl_id, - i, subkey); - ret = dict_set_str (output, key, status); - } - if (!op_ret && (crawl == FULL)) - break; - } - if (output) { - snprintf (key, sizeof (key), "%d-%d-%s", xl_id, i, - subkey); - ret = dict_set_str (output, key, status); - } - } + int ret = 0; + int count = 0; + + ret = afr_shd_index_sweep(healer, GF_XATTROP_INDEX_GFID); + if (ret < 0) + goto out; + count = ret; + + ret = afr_shd_index_sweep(healer, GF_XATTROP_DIRTY_GFID); + if (ret < 0) + goto out; + count += ret; + + ret = afr_shd_index_sweep(healer, GF_XATTROP_ENTRY_CHANGES_GFID); + if (ret < 0) + goto out; + count += ret; out: - return op_ret; + if (ret < 0) + return ret; + else + return count; } int -_do_self_heal_on_local_subvols (xlator_t *this, afr_crawl_type_t crawl, - dict_t *output) +afr_shd_full_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, + void *data) { - return _do_crawl_op_on_local_subvols (this, crawl, HEAL, output); + struct subvol_healer *healer = data; + xlator_t *this = healer->this; + afr_private_t *priv = NULL; + + priv = this->private; + + if (this->cleanup_starting) { + return -ENOTCONN; + } + + if (!priv->shd.enabled) + return -EBUSY; + + afr_shd_selfheal_name(healer, healer->subvol, parent->inode->gfid, + entry->d_name); + + afr_shd_selfheal(healer, healer->subvol, entry->d_stat.ia_gfid); + + return 0; } int -_get_index_summary_on_local_subvols (xlator_t *this, dict_t *output) +afr_shd_full_sweep(struct subvol_healer *healer, inode_t *inode) { - return _do_crawl_op_on_local_subvols (this, INDEX, INFO, output); + afr_private_t *priv = NULL; + loc_t loc = {0}; + + priv = healer->this->private; + loc.inode = inode; + return syncop_ftw(priv->children[healer->subvol], &loc, + GF_CLIENT_PID_SELF_HEALD, healer, afr_shd_full_heal); } int -_add_all_subvols_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict) +afr_shd_fill_ta_loc(xlator_t *this, loc_t *loc) { - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - int i = 0; + afr_private_t *priv = NULL; + struct iatt stbuf = { + 0, + }; + int ret = -1; + + priv = this->private; + loc->parent = inode_ref(this->itable->root); + gf_uuid_copy(loc->pargfid, loc->parent->gfid); + loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX]; + loc->inode = inode_new(loc->parent->table); + GF_CHECK_ALLOC(loc->inode, ret, out); + + if (!gf_uuid_is_null(priv->ta_gfid)) + goto assign_gfid; + + ret = syncop_lookup(priv->children[THIN_ARBITER_BRICK_INDEX], loc, &stbuf, + 0, 0, 0); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed lookup on file %s.", loc->name); + goto out; + } + + gf_uuid_copy(priv->ta_gfid, stbuf.ia_gfid); + +assign_gfid: + gf_uuid_copy(loc->gfid, priv->ta_gfid); + ret = 0; - priv = this->private; - shd = &priv->shd; +out: + if (ret) + loc_wipe(loc); - for (i = 0; i < priv->child_count; i++) { - if (shd->pos[i] != AFR_POS_LOCAL) - continue; - _add_eh_to_dict (this, eh, dict, i); - } - return 0; + return ret; } int -afr_xl_op (xlator_t *this, dict_t *input, dict_t *output) +_afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata) { - gf_xl_afr_op_t op = GF_AFR_OP_INVALID; - int ret = 0; - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - int xl_id = 0; + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + int raw[AFR_NUM_CHANGE_LOGS] = { + 0, + }; + int ret = -1; + int i = 0; + + priv = this->private; + + xattr = dict_new(); + if (!xattr) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_DICT_GET_FAILED, + "Failed to create dict."); + goto out; + } + for (i = 0; i < priv->child_count; i++) { + ret = dict_set_static_bin(xattr, priv->pending_key[i], &raw, + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret) + goto out; + } - priv = this->private; - shd = &priv->shd; + ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc, + GF_XATTROP_ADD_ARRAY, xattr, NULL, xdata, NULL); + if (ret || !(*xdata)) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Xattrop failed on %s.", loc->name); + } - ret = dict_get_int32 (input, "xl-op", (int32_t*)&op); - if (ret) - goto out; - ret = dict_get_int32 (input, this->name, &xl_id); - if (ret) - goto out; - ret = dict_set_int32 (output, this->name, xl_id); - if (ret) - goto out; - switch (op) { - case GF_AFR_OP_HEAL_INDEX: - ret = _do_self_heal_on_local_subvols (this, INDEX, output); - break; - case GF_AFR_OP_HEAL_FULL: - ret = _do_self_heal_on_local_subvols (this, FULL, output); - break; - case GF_AFR_OP_INDEX_SUMMARY: - (void)_get_index_summary_on_local_subvols (this, output); - ret = 0; - break; - case GF_AFR_OP_HEALED_FILES: - ret = _add_all_subvols_eh_to_dict (this, shd->healed, output); - break; - case GF_AFR_OP_HEAL_FAILED_FILES: - ret = _add_all_subvols_eh_to_dict (this, shd->heal_failed, - output); - break; - case GF_AFR_OP_SPLIT_BRAIN_FILES: - ret = _add_all_subvols_eh_to_dict (this, shd->split_brain, - output); - break; - default: - gf_log (this->name, GF_LOG_ERROR, "Unknown set op %d", op); - break; - } out: - dict_del (output, this->name); - return ret; + if (xattr) + dict_unref(xattr); + + return ret; } void -afr_poll_self_heal (void *data) +afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, struct subvol_healer *healer, + dict_t **xdata) { - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - struct timeval timeout = {0}; - xlator_t *this = NULL; - long child = (long)data; - gf_timer_t *old_timer = NULL; - gf_timer_t *new_timer = NULL; - shd_pos_t pos_data = {0}; - int ret = 0; - - this = THIS; - priv = this->private; - shd = &priv->shd; - - if (shd->pos[child] == AFR_POS_UNKNOWN) { - pos_data.this = this; - pos_data.child = child; - ret = synctask_new (this->ctx->env, - afr_syncop_find_child_position, - NULL, NULL, &pos_data); - if (!ret) - shd->pos[child] = pos_data.pos; - } - if (shd->enabled && (shd->pos[child] == AFR_POS_LOCAL)) - _do_self_heal_on_subvol (this, child, INDEX); - timeout.tv_sec = shd->timeout; - timeout.tv_usec = 0; - //notify and previous timer should be synchronized. - LOCK (&priv->lock); - { - old_timer = shd->timer[child]; - if (shd->pos[child] == AFR_POS_REMOTE) - goto unlock; - shd->timer[child] = gf_timer_call_after (this->ctx, timeout, - afr_poll_self_heal, - data); - new_timer = shd->timer[child]; - } -unlock: - UNLOCK (&priv->lock); - - if (old_timer) - gf_timer_call_cancel (this->ctx, old_timer); - if (!new_timer && (shd->pos[child] != AFR_POS_REMOTE)) { - gf_log (this->name, GF_LOG_WARNING, - "Could not create self-heal polling timer for %s", - priv->children[child]->name); + int ret = 0; + + loc_wipe(loc); + if (afr_shd_fill_ta_loc(this, loc)) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to populate thin-arbiter loc for: %s.", loc->name); + ret = -1; + goto out; + } + + ret = afr_ta_post_op_lock(this, loc); + if (ret) + goto out; + + ret = _afr_shd_ta_get_xattrs(this, loc, xdata); + if (ret) { + if (*xdata) { + dict_unref(*xdata); + *xdata = NULL; } - return; + } + + afr_ta_post_op_unlock(this, loc); + +out: + if (ret) + healer->rerun = 1; } -static int -afr_handle_child_up (int ret, call_frame_t *sync_frame, void *data) +int +afr_shd_ta_unset_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata, int healer) { - afr_self_heald_t *shd = NULL; - shd_pos_t *pos_data = data; - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + gf_boolean_t need_xattrop = _gf_false; + void *pending_raw = NULL; + int *raw = NULL; + int pending[AFR_NUM_CHANGE_LOGS] = { + 0, + }; + int i = 0; + int j = 0; + int val = 0; + int ret = -1; + + priv = this->private; + + xattr = dict_new(); + if (!xattr) { + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + raw = GF_CALLOC(AFR_NUM_CHANGE_LOGS, sizeof(int), gf_afr_mt_int32_t); + if (!raw) { + goto out; + } - if (ret) - goto out; + ret = dict_get_ptr(*xdata, priv->pending_key[i], &pending_raw); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED, + "Error getting value " + "of pending key %s", + priv->pending_key[i]); + GF_FREE(raw); + goto out; + } + + memcpy(pending, pending_raw, sizeof(pending)); + for (j = 0; j < AFR_NUM_CHANGE_LOGS; j++) { + val = ntoh32(pending[j]); + if (val) { + if (i == healer) { + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_THIN_ARB, + "I am " + "not the good shd. Skipping. " + "SHD = %d.", + healer); + ret = 0; + GF_FREE(raw); + goto out; + } + need_xattrop = _gf_true; + raw[j] = hton32(-val); + } + } + + ret = dict_set_bin(xattr, priv->pending_key[i], raw, + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret) { + GF_FREE(raw); + goto out; + } + + if (need_xattrop) + break; + } + + if (!need_xattrop) { + ret = 0; + goto out; + } + + ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc, + GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Xattrop failed."); - priv = pos_data->this->private; - shd = &priv->shd; - shd->pos[pos_data->child] = pos_data->pos; - if (pos_data->pos != AFR_POS_REMOTE) - afr_poll_self_heal ((void*)(long)pos_data->child); - _do_self_heal_on_local_subvols (THIS, INDEX, NULL); out: - GF_FREE (data); - return 0; + if (xattr) + dict_unref(xattr); + + return ret; } void -afr_proactive_self_heal (void *data) +afr_shd_ta_check_and_unset_xattrs(xlator_t *this, loc_t *loc, + struct subvol_healer *healer, + dict_t *pre_crawl_xdata) { - xlator_t *this = NULL; - long child = (long)data; - shd_pos_t *pos_data = NULL; - int ret = 0; + int ret_lock = 0; + int ret = 0; + dict_t *post_crawl_xdata = NULL; - this = THIS; + ret_lock = afr_ta_post_op_lock(this, loc); + if (ret_lock) + goto unref; - //Position of brick could have changed and it could be local now. - //Compute the position again - pos_data = GF_CALLOC (1, sizeof (*pos_data), gf_afr_mt_pos_data_t); - if (!pos_data) - goto out; - pos_data->this = this; - pos_data->child = child; - ret = synctask_new (this->ctx->env, afr_syncop_find_child_position, - afr_handle_child_up, NULL, pos_data); - if (ret) - goto out; -out: - return; + ret = _afr_shd_ta_get_xattrs(this, loc, &post_crawl_xdata); + if (ret) + goto unref; + + if (!are_dicts_equal(pre_crawl_xdata, post_crawl_xdata, NULL, NULL)) { + ret = -1; + goto unref; + } + + ret = afr_shd_ta_unset_xattrs(this, loc, &post_crawl_xdata, healer->subvol); + +unref: + if (post_crawl_xdata) { + dict_unref(post_crawl_xdata); + post_crawl_xdata = NULL; + } + + if (ret || ret_lock) + healer->rerun = 1; + + if (!ret_lock) + afr_ta_post_op_unlock(this, loc); } -static int -get_pathinfo_host (char *pathinfo, char *hostname, size_t size) +gf_boolean_t +afr_bricks_available_for_heal(afr_private_t *priv) { - char *start = NULL; - char *end = NULL; - int ret = -1; - int i = 0; + int up_children = 0; - if (!pathinfo) - goto out; + up_children = __afr_get_up_children_count(priv); + if (up_children < 2) { + return _gf_false; + } + return _gf_true; +} - start = strchr (pathinfo, ':'); - if (!start) - goto out; - end = strrchr (pathinfo, ':'); - if (start == end) - goto out; +static gf_boolean_t +afr_shd_ta_needs_heal(xlator_t *this, struct subvol_healer *healer) +{ + dict_t *xdata = NULL; + afr_private_t *priv = NULL; + loc_t loc = { + 0, + }; + int ret = -1; + int i = 0; + gf_boolean_t need_heal = _gf_false; + + priv = this->private; + + ret = afr_shd_fill_ta_loc(this, &loc); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to populate thin-arbiter loc for: %s.", loc.name); + healer->rerun = 1; + goto out; + } + + if (_afr_shd_ta_get_xattrs(this, &loc, &xdata)) { + healer->rerun = 1; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (afr_ta_dict_contains_pending_xattr(xdata, priv, i)) { + need_heal = _gf_true; + break; + } + } - memset (hostname, 0, size); - i = 0; - while (++start != end) - hostname[i++] = *start; - ret = 0; out: - return ret; + if (xdata) + dict_unref(xdata); + loc_wipe(&loc); + + return need_heal; } -int -afr_local_pathinfo (char *pathinfo, gf_boolean_t *local) +static int +afr_shd_anon_inode_cleaner(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, + void *data) { - int ret = 0; - char pathinfohost[1024] = {0}; - char localhost[1024] = {0}; - xlator_t *this = THIS; - - *local = _gf_false; - ret = get_pathinfo_host (pathinfo, pathinfohost, sizeof (pathinfohost)); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Invalid pathinfo: %s", - pathinfo); + struct subvol_healer *healer = data; + afr_private_t *priv = healer->this->private; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + int ret = 0; + loc_t loc = {0}; + int count = 0; + int i = 0; + int op_errno = 0; + struct iatt *iatt = NULL; + gf_boolean_t multiple_links = _gf_false; + unsigned char *gfid_present = alloca0(priv->child_count); + unsigned char *entry_present = alloca0(priv->child_count); + char *type = "file"; + + frame = afr_frame_create(healer->this, &ret); + if (!frame) { + ret = -ret; + goto out; + } + local = frame->local; + if (AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) { + gf_msg_debug(healer->this->name, 0, + "Not all bricks are up. Skipping " + "cleanup of %s on %s", + entry->d_name, subvol->name); + ret = 0; + goto out; + } + + loc.inode = inode_new(parent->inode->table); + if (!loc.inode) { + ret = -ENOMEM; + goto out; + } + ret = gf_uuid_parse(entry->d_name, loc.gfid); + if (ret) { + ret = 0; + goto out; + } + AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc, + NULL); + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == 0) { + count++; + gfid_present[i] = 1; + iatt = &local->replies[i].poststat; + if (iatt->ia_type == IA_IFDIR) { + type = "dir"; + } + + if (i == healer->subvol) { + if (local->replies[i].poststat.ia_nlink > 1) { + multiple_links = _gf_true; + } + } + } else if (local->replies[i].op_errno != ENOENT && + local->replies[i].op_errno != ESTALE) { + /*We don't have complete view. Skip the entry*/ + gf_msg_debug(healer->this->name, local->replies[i].op_errno, + "Skipping cleanup of %s on %s", entry->d_name, + subvol->name); + ret = 0; + goto out; + } + } + + /*Inode is deleted from subvol*/ + if (count == 1 || (iatt->ia_type != IA_IFDIR && multiple_links)) { + gf_msg(healer->this->name, GF_LOG_WARNING, 0, + AFR_MSG_EXPUNGING_FILE_OR_DIR, "expunging %s %s/%s on %s", type, + priv->anon_inode_name, entry->d_name, subvol->name); + ret = afr_shd_entry_purge(subvol, parent->inode, entry->d_name, + iatt->ia_type); + if (ret == -ENOENT || ret == -ESTALE) + ret = 0; + } else if (count > 1) { + loc_wipe(&loc); + loc.parent = inode_ref(parent->inode); + loc.name = entry->d_name; + loc.inode = inode_new(parent->inode->table); + if (!loc.inode) { + ret = -ENOMEM; + goto out; + } + AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, + &loc, NULL); + count = 0; + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == 0) { + count++; + entry_present[i] = 1; + iatt = &local->replies[i].poststat; + } else if (local->replies[i].op_errno != ENOENT && + local->replies[i].op_errno != ESTALE) { + /*We don't have complete view. Skip the entry*/ + gf_msg_debug(healer->this->name, local->replies[i].op_errno, + "Skipping cleanup of %s on %s", entry->d_name, + subvol->name); + ret = 0; goto out; + } } - - ret = gethostname (localhost, sizeof (localhost)); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "gethostname() failed, " - "reason: %s", strerror (errno)); + for (i = 0; i < priv->child_count; i++) { + if (gfid_present[i] && !entry_present[i]) { + /*Entry is not anonymous on at least one subvol*/ + gf_msg_debug(healer->this->name, 0, + "Valid entry present on %s " + "Skipping cleanup of %s on %s", + priv->children[i]->name, entry->d_name, + subvol->name); + ret = 0; goto out; + } } - if (!strcmp (localhost, pathinfohost)) - *local = _gf_true; + gf_msg(healer->this->name, GF_LOG_WARNING, 0, + AFR_MSG_EXPUNGING_FILE_OR_DIR, + "expunging %s %s/%s on all subvols", type, priv->anon_inode_name, + entry->d_name); + ret = 0; + for (i = 0; i < priv->child_count; i++) { + op_errno = -afr_shd_entry_purge(priv->children[i], loc.parent, + entry->d_name, iatt->ia_type); + if (op_errno != ENOENT && op_errno != ESTALE) { + ret |= -op_errno; + } + } + } + out: - return ret; + if (frame) + AFR_STACK_DESTROY(frame); + loc_wipe(&loc); + return ret; } -int -afr_crawl_build_start_loc (xlator_t *this, afr_crawl_data_t *crawl_data, - loc_t *dirloc) +static void +afr_cleanup_anon_inode_dir(struct subvol_healer *healer) { - afr_private_t *priv = NULL; - dict_t *xattr = NULL; - void *index_gfid = NULL; - loc_t rootloc = {0}; - struct iatt iattr = {0}; - struct iatt parent = {0}; - int ret = 0; - xlator_t *readdir_xl = crawl_data->readdir_xl; - - priv = this->private; - if (crawl_data->crawl == FULL) { - afr_build_root_loc (this, dirloc); - } else { - afr_build_root_loc (this, &rootloc); - ret = syncop_getxattr (readdir_xl, &rootloc, &xattr, - GF_XATTROP_INDEX_GFID); - if (ret < 0) - goto out; - ret = dict_get_ptr (xattr, GF_XATTROP_INDEX_GFID, &index_gfid); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "failed to get index " - "dir gfid on %s", readdir_xl->name); - goto out; - } - if (!index_gfid) { - gf_log (this->name, GF_LOG_ERROR, "index gfid empty " - "on %s", readdir_xl->name); - ret = -1; - goto out; - } - uuid_copy (dirloc->gfid, index_gfid); - dirloc->path = ""; - dirloc->inode = inode_new (priv->root_inode->table); - ret = syncop_lookup (readdir_xl, dirloc, NULL, - &iattr, NULL, &parent); - if (ret < 0) { - if (errno != ENOENT) { - gf_log (this->name, GF_LOG_ERROR, "lookup " - "failed on index dir on %s - (%s)", - readdir_xl->name, strerror (errno)); - } - goto out; - } - ret = _link_inode_update_loc (this, dirloc, &iattr); - if (ret) - goto out; - } - ret = 0; + int ret = 0; + call_frame_t *frame = NULL; + afr_private_t *priv = healer->this->private; + loc_t loc = {0}; + + ret = afr_anon_inode_create(healer->this, healer->subvol, &loc.inode); + if (ret) + goto out; + + frame = afr_frame_create(healer->this, &ret); + if (!frame) { + ret = -ret; + goto out; + } + + ret = syncop_mt_dir_scan(frame, priv->children[healer->subvol], &loc, + GF_CLIENT_PID_SELF_HEALD, healer, + afr_shd_anon_inode_cleaner, NULL, + priv->shd.max_threads, priv->shd.wait_qlength); out: - if (xattr) - dict_unref (xattr); - loc_wipe (&rootloc); - return ret; + if (frame) + AFR_STACK_DESTROY(frame); + loc_wipe(&loc); + return; } -int -afr_crawl_opendir (xlator_t *this, afr_crawl_data_t *crawl_data, fd_t **dirfd, - loc_t *dirloc) +void * +afr_shd_index_healer(void *data) { - fd_t *fd = NULL; - int ret = 0; - - if (crawl_data->crawl == FULL) { - fd = fd_create (dirloc->inode, crawl_data->pid); - if (!fd) { - gf_log (this->name, GF_LOG_ERROR, - "Failed to create fd for %s", dirloc->path); - ret = -1; - goto out; - } + struct subvol_healer *healer = NULL; + xlator_t *this = NULL; + int ret = 0; + afr_private_t *priv = NULL; + dict_t *pre_crawl_xdata = NULL; + loc_t loc = { + 0, + }; + + healer = data; + THIS = this = healer->this; + priv = this->private; + + for (;;) { + afr_shd_healer_wait(healer); + + if (!afr_bricks_available_for_heal(priv)) + continue; + + ASSERT_LOCAL(this, healer); + priv->local[healer->subvol] = healer->local; + + if (priv->thin_arbiter_count) { + if (afr_shd_ta_needs_heal(this, healer)) + afr_shd_ta_get_xattrs(this, &loc, healer, &pre_crawl_xdata); + } - ret = syncop_opendir (crawl_data->readdir_xl, dirloc, fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "opendir failed on %s", dirloc->path); - goto out; - } - } else { - fd = fd_anonymous (dirloc->inode); + do { + gf_msg_debug(this->name, 0, "starting index sweep on subvol %s", + afr_subvol_name(this, healer->subvol)); + + afr_shd_sweep_prepare(healer); + + ret = afr_shd_index_sweep_all(healer); + + afr_shd_sweep_done(healer); + /* + As long as at least one gfid was + healed, keep retrying. We may have + just healed a directory and thereby + created entries for other gfids which + could not be healed thus far. + */ + + gf_msg_debug(this->name, 0, "finished index sweep on subvol %s", + afr_subvol_name(this, healer->subvol)); + /* + Give a pause before retrying to avoid a busy loop + in case the only entry in index is because of + an ongoing I/O. + */ + sleep(1); + } while (ret > 0); + + if (ret == 0) { + afr_cleanup_anon_inode_dir(healer); } - ret = 0; -out: - if (!ret) - *dirfd = fd; - return ret; + + if (ret == 0 && pre_crawl_xdata && + !healer->crawl_event.heal_failed_count) { + afr_shd_ta_check_and_unset_xattrs(this, &loc, healer, + pre_crawl_xdata); + } + + if (pre_crawl_xdata) { + dict_unref(pre_crawl_xdata); + pre_crawl_xdata = NULL; + } + } + + return NULL; } -xlator_t* -afr_crawl_readdir_xl_get (xlator_t *this, afr_crawl_data_t *crawl_data) +void * +afr_shd_full_healer(void *data) { - afr_private_t *priv = this->private; + struct subvol_healer *healer = NULL; + xlator_t *this = NULL; + int run = 0; - if (crawl_data->crawl == FULL) { - return this; - } else { - return priv->children[crawl_data->child]; + healer = data; + THIS = this = healer->this; + + for (;;) { + pthread_mutex_lock(&healer->mutex); + { + run = __afr_shd_healer_wait(healer); + if (!run) + healer->running = _gf_false; } - return NULL; + pthread_mutex_unlock(&healer->mutex); + + if (!run) + break; + + ASSERT_LOCAL(this, healer); + + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO, + "starting full sweep on subvol %s", + afr_subvol_name(this, healer->subvol)); + + afr_shd_sweep_prepare(healer); + + afr_shd_full_sweep(healer, this->itable->root); + + afr_shd_sweep_done(healer); + + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO, + "finished full sweep on subvol %s", + afr_subvol_name(this, healer->subvol)); + } + + return NULL; } int -afr_crawl_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, - gf_dirent_t *entry, afr_crawl_data_t *crawl_data) +afr_shd_healer_init(xlator_t *this, struct subvol_healer *healer) { - int ret = -1; - afr_private_t *priv = NULL; + int ret = 0; - priv = this->private; - if (crawl_data->crawl == FULL) { - ret = afr_build_child_loc (this, child, parent, entry->d_name); - } else { - child->inode = inode_new (priv->root_inode->table); - if (!child->inode) - goto out; - uuid_parse (entry->d_name, child->gfid); - ret = _loc_assign_gfid_path (child); - } + ret = pthread_mutex_init(&healer->mutex, NULL); + if (ret) + goto out; + + ret = pthread_cond_init(&healer->cond, NULL); + if (ret) + goto out; + + healer->this = this; + healer->running = _gf_false; + healer->rerun = _gf_false; + healer->local = _gf_false; out: - return ret; + return ret; } -static int -_process_entries (xlator_t *this, loc_t *parentloc, gf_dirent_t *entries, - off_t *offset, afr_crawl_data_t *crawl_data) +int +afr_shd_healer_spawn(xlator_t *this, struct subvol_healer *healer, + void *(threadfn)(void *)) { - gf_dirent_t *entry = NULL; - gf_dirent_t *tmp = NULL; - int ret = 0; - loc_t entry_loc = {0}; - fd_t *fd = NULL; - struct iatt iattr = {0}; - - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - if (!_crawl_proceed (this, crawl_data->child, - crawl_data->crawl_flags, NULL)) { - ret = -1; - goto out; - } - *offset = entry->d_off; - if (IS_ENTRY_CWD (entry->d_name) || - IS_ENTRY_PARENT (entry->d_name)) - continue; - if ((crawl_data->crawl == FULL) && - uuid_is_null (entry->d_stat.ia_gfid)) { - gf_log (this->name, GF_LOG_WARNING, "%s/%s: No " - "gfid present skipping", - parentloc->path, entry->d_name); - continue; - } + int ret = 0; - loc_wipe (&entry_loc); - ret = afr_crawl_build_child_loc (this, &entry_loc, parentloc, - entry, crawl_data); - if (ret) - goto out; - - ret = crawl_data->process_entry (this, crawl_data, entry, - &entry_loc, parentloc, &iattr); - - if (ret) - continue; - - if (crawl_data->crawl == INDEX) - continue; - - if (!IA_ISDIR (iattr.ia_type)) - continue; - fd = NULL; - ret = afr_crawl_opendir (this, crawl_data, &fd, &entry_loc); - if (ret) - continue; - ret = _crawl_directory (fd, &entry_loc, crawl_data); - if (fd) - fd_unref (fd); + pthread_mutex_lock(&healer->mutex); + { + if (healer->running) { + pthread_cond_signal(&healer->cond); + } else { + ret = gf_thread_create(&healer->thread, NULL, threadfn, healer, + "shdheal"); + if (ret) + goto unlock; + healer->running = 1; } - ret = 0; -out: - loc_wipe (&entry_loc); - return ret; -} -static int -_crawl_directory (fd_t *fd, loc_t *loc, afr_crawl_data_t *crawl_data) -{ - xlator_t *this = NULL; - off_t offset = 0; - gf_dirent_t entries; - int ret = 0; - gf_boolean_t free_entries = _gf_false; - xlator_t *readdir_xl = crawl_data->readdir_xl; - - INIT_LIST_HEAD (&entries.list); - this = THIS; - - GF_ASSERT (loc->inode); - - if (crawl_data->crawl == FULL) - gf_log (this->name, GF_LOG_DEBUG, "crawling %s", loc->path); - else - gf_log (this->name, GF_LOG_DEBUG, "crawling INDEX %s", - uuid_utoa (loc->gfid)); - - while (1) { - if (crawl_data->crawl == FULL) - ret = syncop_readdirp (readdir_xl, fd, 131072, offset, - NULL, &entries); - else - ret = syncop_readdir (readdir_xl, fd, 131072, offset, - &entries); - if (ret <= 0) - break; - ret = 0; - free_entries = _gf_true; + healer->rerun = 1; + } +unlock: + pthread_mutex_unlock(&healer->mutex); - if (!_crawl_proceed (this, crawl_data->child, - crawl_data->crawl_flags, NULL)) { - ret = -1; - goto out; - } - if (list_empty (&entries.list)) - goto out; + return ret; +} - ret = _process_entries (this, loc, &entries, &offset, - crawl_data); - gf_dirent_free (&entries); - free_entries = _gf_false; - } - ret = 0; -out: - if (free_entries) - gf_dirent_free (&entries); - return ret; +int +afr_shd_full_healer_spawn(xlator_t *this, int subvol) +{ + return afr_shd_healer_spawn(this, NTH_FULL_HEALER(this, subvol), + afr_shd_full_healer); } -static char* -position_str_get (afr_child_pos_t pos) +int +afr_shd_index_healer_spawn(xlator_t *this, int subvol) { - switch (pos) { - case AFR_POS_UNKNOWN: - return "unknown"; - case AFR_POS_LOCAL: - return "local"; - case AFR_POS_REMOTE: - return "remote"; - } - return NULL; + return afr_shd_healer_spawn(this, NTH_INDEX_HEALER(this, subvol), + afr_shd_index_healer); } int -afr_find_child_position (xlator_t *this, int child, afr_child_pos_t *pos) +afr_shd_dict_add_crawl_event(xlator_t *this, dict_t *output, + crawl_event_t *crawl_event) { - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - dict_t *xattr_rsp = NULL; - loc_t loc = {0}; - int ret = 0; - char *node_uuid = NULL; - - priv = this->private; - shd = &priv->shd; - - afr_build_root_loc (this, &loc); - - ret = syncop_getxattr (priv->children[child], &loc, &xattr_rsp, - GF_XATTR_NODE_UUID_KEY); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "getxattr failed on %s - " - "(%s)", priv->children[child]->name, strerror (errno)); - goto out; - } + int ret = 0; + uint64_t count = 0; + char key[128] = {0}; + int keylen = 0; + char suffix[64] = {0}; + int xl_id = 0; + uint64_t healed_count = 0; + uint64_t split_brain_count = 0; + uint64_t heal_failed_count = 0; + char *start_time_str = 0; + char *end_time_str = NULL; + char *crawl_type = NULL; + int progress = -1; + int child = -1; + + child = crawl_event->child; + healed_count = crawl_event->healed_count; + split_brain_count = crawl_event->split_brain_count; + heal_failed_count = crawl_event->heal_failed_count; + crawl_type = crawl_event->crawl_type; + + if (!crawl_event->start_time) + goto out; + + start_time_str = gf_strdup(ctime(&crawl_event->start_time)); + + if (crawl_event->end_time) + end_time_str = gf_strdup(ctime(&crawl_event->end_time)); + + ret = dict_get_int32(output, this->name, &xl_id); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED, + "xl does not have id"); + goto out; + } + + snprintf(key, sizeof(key), "statistics-%d-%d-count", xl_id, child); + ret = dict_get_uint64(output, key, &count); + + snprintf(suffix, sizeof(suffix), "%d-%d-%" PRIu64, xl_id, child, count); + snprintf(key, sizeof(key), "statistics_healed_cnt-%s", suffix); + ret = dict_set_uint64(output, key, healed_count); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Could not add statistics_healed_count to output"); + goto out; + } + + snprintf(key, sizeof(key), "statistics_sb_cnt-%s", suffix); + ret = dict_set_uint64(output, key, split_brain_count); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Could not add statistics_split_brain_count to output"); + goto out; + } + + keylen = snprintf(key, sizeof(key), "statistics_crawl_type-%s", suffix); + ret = dict_set_strn(output, key, keylen, crawl_type); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Could not add statistics_crawl_type to output"); + goto out; + } + + snprintf(key, sizeof(key), "statistics_heal_failed_cnt-%s", suffix); + ret = dict_set_uint64(output, key, heal_failed_count); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Could not add statistics_healed_failed_count to output"); + goto out; + } + + keylen = snprintf(key, sizeof(key), "statistics_strt_time-%s", suffix); + ret = dict_set_dynstrn(output, key, keylen, start_time_str); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Could not add statistics_crawl_start_time to output"); + goto out; + } else { + start_time_str = NULL; + } + + if (!end_time_str) + progress = 1; + else + progress = 0; + + keylen = snprintf(key, sizeof(key), "statistics_end_time-%s", suffix); + if (!end_time_str) + end_time_str = gf_strdup("Could not determine the end time"); + ret = dict_set_dynstrn(output, key, keylen, end_time_str); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Could not add statistics_crawl_end_time to output"); + goto out; + } else { + end_time_str = NULL; + } + + keylen = snprintf(key, sizeof(key), "statistics_inprogress-%s", suffix); + + ret = dict_set_int32n(output, key, keylen, progress); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Could not add statistics_inprogress to output"); + goto out; + } + + snprintf(key, sizeof(key), "statistics-%d-%d-count", xl_id, child); + ret = dict_set_uint64(output, key, count + 1); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Could not increment the counter."); + goto out; + } +out: + GF_FREE(start_time_str); + GF_FREE(end_time_str); + return ret; +} - ret = dict_get_str (xattr_rsp, GF_XATTR_NODE_UUID_KEY, &node_uuid); +int +afr_shd_dict_add_path(xlator_t *this, dict_t *output, int child, char *path, + struct timeval *tv) +{ + int ret = -1; + uint64_t count = 0; + char key[64] = {0}; + int keylen = 0; + char xl_id_child_str[32] = {0}; + int xl_id = 0; + + ret = dict_get_int32(output, this->name, &xl_id); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED, + "xl does not have id"); + goto out; + } + + snprintf(xl_id_child_str, sizeof(xl_id_child_str), "%d-%d", xl_id, child); + snprintf(key, sizeof(key), "%s-count", xl_id_child_str); + ret = dict_get_uint64(output, key, &count); + + keylen = snprintf(key, sizeof(key), "%s-%" PRIu64, xl_id_child_str, count); + ret = dict_set_dynstrn(output, key, keylen, path); + + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "%s: Could not add to output", path); + goto out; + } + + if (tv) { + snprintf(key, sizeof(key), "%s-%" PRIu64 "-time", xl_id_child_str, + count); + ret = dict_set_uint32(output, key, tv->tv_sec); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "node-uuid key not found on " - "child %s", priv->children[child]->name); - goto out; + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "%s: Could not set time", path); + goto out; } + } + + snprintf(key, sizeof(key), "%s-count", xl_id_child_str); - if (!strcmp (node_uuid, shd->node_uuid)) - *pos = AFR_POS_LOCAL; - else - *pos = AFR_POS_REMOTE; + ret = dict_set_uint64(output, key, count + 1); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Could not increment count"); + goto out; + } - gf_log (this->name, GF_LOG_DEBUG, "child %s is %s", - priv->children[child]->name, position_str_get (*pos)); + ret = 0; out: - if (ret) - *pos = AFR_POS_UNKNOWN; - loc_wipe (&loc); - return ret; + return ret; } int -afr_syncop_find_child_position (void *data) +afr_add_shd_event(circular_buffer_t *cb, void *data) { - shd_pos_t *pos_data = data; - int ret = 0; + dict_t *output = NULL; + xlator_t *this = THIS; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + shd_event_t *shd_event = NULL; + char *path = NULL; + + output = data; + priv = this->private; + shd = &priv->shd; + shd_event = cb->data; + + if (!shd->index_healers[shd_event->child].local) + return 0; - ret = afr_find_child_position (pos_data->this, pos_data->child, - &pos_data->pos); - return ret; + path = gf_strdup(shd_event->path); + if (!path) + return -ENOMEM; + + afr_shd_dict_add_path(this, output, shd_event->child, path, &cb->tv); + return 0; } -static int -afr_dir_crawl (void *data) +int +afr_add_crawl_event(circular_buffer_t *cb, void *data) { - xlator_t *this = NULL; - int ret = -1; - xlator_t *readdir_xl = NULL; - fd_t *fd = NULL; - loc_t dirloc = {0}; - afr_crawl_data_t *crawl_data = data; - - this = THIS; + dict_t *output = NULL; + xlator_t *this = THIS; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + crawl_event_t *crawl_event = NULL; + + output = data; + priv = this->private; + shd = &priv->shd; + crawl_event = cb->data; + + if (!shd->index_healers[crawl_event->child].local) + return 0; - if (!_crawl_proceed (this, crawl_data->child, crawl_data->crawl_flags, - NULL)) - goto out; + afr_shd_dict_add_crawl_event(this, output, crawl_event); - readdir_xl = afr_crawl_readdir_xl_get (this, crawl_data); - if (!readdir_xl) - goto out; - crawl_data->readdir_xl = readdir_xl; + return 0; +} - ret = afr_crawl_build_start_loc (this, crawl_data, &dirloc); +int +afr_selfheal_daemon_init(xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int ret = -1; + int i = 0; + + priv = this->private; + shd = &priv->shd; + + shd->index_healers = GF_CALLOC(sizeof(*shd->index_healers), + priv->child_count, + gf_afr_mt_subvol_healer_t); + if (!shd->index_healers) + goto out; + + for (i = 0; i < priv->child_count; i++) { + shd->index_healers[i].subvol = i; + ret = afr_shd_healer_init(this, &shd->index_healers[i]); if (ret) - goto out; - - ret = afr_crawl_opendir (this, crawl_data, &fd, &dirloc); + goto out; + } + + shd->full_healers = GF_CALLOC(sizeof(*shd->full_healers), priv->child_count, + gf_afr_mt_subvol_healer_t); + if (!shd->full_healers) + goto out; + for (i = 0; i < priv->child_count; i++) { + shd->full_healers[i].subvol = i; + ret = afr_shd_healer_init(this, &shd->full_healers[i]); if (ret) - goto out; - - ret = _crawl_directory (fd, &dirloc, crawl_data); - if (ret) - gf_log (this->name, GF_LOG_ERROR, "Crawl failed on %s", - readdir_xl->name); - else - gf_log (this->name, GF_LOG_DEBUG, "Crawl completed " - "on %s", readdir_xl->name); - if (crawl_data->crawl == INDEX) - dirloc.path = NULL; + goto out; + } + + shd->split_brain = eh_new(AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false, + afr_destroy_shd_event_data); + if (!shd->split_brain) + goto out; + + shd->statistics = GF_CALLOC(sizeof(eh_t *), priv->child_count, + gf_common_mt_eh_t); + if (!shd->statistics) + goto out; + + for (i = 0; i < priv->child_count; i++) { + shd->statistics[i] = eh_new(AFR_STATISTICS_HISTORY_SIZE, _gf_false, + afr_destroy_crawl_event_data); + if (!shd->statistics[i]) + goto out; + shd->full_healers[i].crawl_event.child = i; + shd->full_healers[i].crawl_event.crawl_type = "FULL"; + shd->index_healers[i].crawl_event.child = i; + shd->index_healers[i].crawl_event.crawl_type = "INDEX"; + } + + ret = 0; out: - if (fd) - fd_unref (fd); - if (crawl_data->crawl == INDEX) - dirloc.path = NULL; - loc_wipe (&dirloc); - return ret; + return ret; } -static int -afr_dir_exclusive_crawl (void *data) +void +afr_selfheal_childup(xlator_t *this, afr_private_t *priv) { - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - gf_boolean_t crawl = _gf_false; - int ret = 0; - int child = -1; - xlator_t *this = NULL; - afr_crawl_data_t *crawl_data = data; - - this = THIS; - priv = this->private; - shd = &priv->shd; - child = crawl_data->child; - - LOCK (&priv->lock); - { - if (shd->inprogress[child]) { - if (shd->pending[child] != FULL) - shd->pending[child] = crawl_data->crawl; - } else { - shd->inprogress[child] = _gf_true; - crawl = _gf_true; - } - } - UNLOCK (&priv->lock); + int subvol = 0; - if (!crawl) { - gf_log (this->name, GF_LOG_INFO, "Another crawl is in progress " - "for %s", priv->children[child]->name); - goto out; - } + if (!priv->shd.iamshd) + return; + for (subvol = 0; subvol < priv->child_count; subvol++) + if (priv->child_up[subvol]) + afr_shd_index_healer_spawn(this, subvol); - do { - afr_dir_crawl (data); - LOCK (&priv->lock); - { - if (shd->pending[child] != NONE) { - crawl_data->crawl = shd->pending[child]; - shd->pending[child] = NONE; - } else { - shd->inprogress[child] = _gf_false; - crawl = _gf_false; - } - } - UNLOCK (&priv->lock); - } while (crawl); -out: - return ret; + return; } -void -afr_start_crawl (xlator_t *this, int idx, afr_crawl_type_t crawl, - process_entry_cbk_t process_entry, void *op_data, - gf_boolean_t exclusive, int crawl_flags, - afr_crawl_done_cbk_t crawl_done) +int +afr_shd_get_index_count(xlator_t *this, int i, uint64_t *count) { - afr_private_t *priv = NULL; - call_frame_t *frame = NULL; - afr_crawl_data_t *crawl_data = NULL; - int ret = 0; - int (*crawler) (void*) = NULL; + afr_private_t *priv = NULL; + xlator_t *subvol = NULL; + loc_t rootloc = { + 0, + }; + dict_t *xattr = NULL; + int ret = -1; - priv = this->private; + priv = this->private; + subvol = priv->children[i]; - frame = create_frame (this, this->ctx->pool); - if (!frame) - goto out; + rootloc.inode = inode_ref(this->itable->root); + gf_uuid_copy(rootloc.gfid, rootloc.inode->gfid); - afr_set_lk_owner (frame, this, frame->root); - afr_set_low_priority (frame); - crawl_data = GF_CALLOC (1, sizeof (*crawl_data), - gf_afr_mt_crawl_data_t); - if (!crawl_data) - goto out; - crawl_data->process_entry = process_entry; - crawl_data->child = idx; - crawl_data->pid = frame->root->pid; - crawl_data->crawl = crawl; - crawl_data->op_data = op_data; - crawl_data->crawl_flags = crawl_flags; - gf_log (this->name, GF_LOG_DEBUG, "starting crawl %d for %s", - crawl_data->crawl, priv->children[idx]->name); - - if (exclusive) - crawler = afr_dir_exclusive_crawl; - else - crawler = afr_dir_crawl; - ret = synctask_new (this->ctx->env, crawler, - crawl_done, frame, crawl_data); - if (ret) - gf_log (this->name, GF_LOG_ERROR, "afr crawl failed for child" - " %d with ret %d", idx, ret); -out: - return; -} + ret = syncop_getxattr(subvol, &rootloc, &xattr, GF_XATTROP_INDEX_COUNT, + NULL, NULL); + if (ret < 0) + goto out; -void -afr_build_root_loc (xlator_t *this, loc_t *loc) -{ - afr_private_t *priv = NULL; + ret = dict_get_uint64(xattr, GF_XATTROP_INDEX_COUNT, count); + if (ret) + goto out; + + ret = 0; + +out: + if (xattr) + dict_unref(xattr); + loc_wipe(&rootloc); - priv = this->private; - loc->path = gf_strdup ("/"); - loc->name = ""; - loc->inode = inode_ref (priv->root_inode); - uuid_copy (loc->gfid, loc->inode->gfid); + return ret; } int -afr_set_root_gfid (dict_t *dict) +afr_xl_op(xlator_t *this, dict_t *input, dict_t *output) { - uuid_t gfid; - int ret = 0; + gf_xl_afr_op_t op = GF_SHD_OP_INVALID; + int ret = 0; + int xl_id = 0; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + struct subvol_healer *healer = NULL; + int i = 0; + char key[64]; + int keylen = 0; + int this_name_len = 0; + int op_ret = 0; + uint64_t cnt = 0; + +#define AFR_SET_DICT_AND_LOG(name, output, key, keylen, dict_str, \ + dict_str_len) \ + { \ + int ret; \ + \ + ret = dict_set_nstrn(output, key, keylen, dict_str, dict_str_len); \ + if (ret) { \ + gf_smsg(name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, \ + "key=%s", key, "value=%s", dict_str, NULL); \ + } \ + } + + priv = this->private; + shd = &priv->shd; + + ret = dict_get_int32_sizen(input, "xl-op", (int32_t *)&op); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED, + "key=xl-op", NULL); + goto out; + } + this_name_len = strlen(this->name); + ret = dict_get_int32n(input, this->name, this_name_len, &xl_id); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED, + "key=%s", this->name, NULL); + goto out; + } + ret = dict_set_int32n(output, this->name, this_name_len, xl_id); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "key=%s", this->name, NULL); + goto out; + } + switch (op) { + case GF_SHD_OP_HEAL_INDEX: + op_ret = 0; + + for (i = 0; i < priv->child_count; i++) { + healer = &shd->index_healers[i]; + keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i); + + if (!priv->child_up[i]) { + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, + SBRICK_NOT_CONNECTED, + SLEN(SBRICK_NOT_CONNECTED)); + op_ret = -1; + } else if (AFR_COUNT(priv->child_up, priv->child_count) < 2) { + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, + SLESS_THAN2_BRICKS_in_REP, + SLEN(SLESS_THAN2_BRICKS_in_REP)); + op_ret = -1; + } else if (!afr_shd_is_subvol_local(this, healer->subvol)) { + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, + SBRICK_IS_REMOTE, + SLEN(SBRICK_IS_REMOTE)); + } else { + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, + SSTARTED_SELF_HEAL, + SLEN(SSTARTED_SELF_HEAL)); - memset (gfid, 0, 16); - gfid[15] = 1; + ret = afr_shd_index_healer_spawn(this, i); - ret = afr_set_dict_gfid (dict, gfid); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + AFR_MSG_HEALER_SPAWN_FAILED, NULL); + } + } + } + break; + case GF_SHD_OP_HEAL_FULL: + op_ret = -1; + + for (i = 0; i < priv->child_count; i++) { + healer = &shd->full_healers[i]; + keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i); + + if (!priv->child_up[i]) { + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, + SBRICK_NOT_CONNECTED, + SLEN(SBRICK_NOT_CONNECTED)); + } else if (AFR_COUNT(priv->child_up, priv->child_count) < 2) { + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, + SLESS_THAN2_BRICKS_in_REP, + SLEN(SLESS_THAN2_BRICKS_in_REP)); + } else if (!afr_shd_is_subvol_local(this, healer->subvol)) { + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, + SBRICK_IS_REMOTE, + SLEN(SBRICK_IS_REMOTE)); + } else { + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, + SSTARTED_SELF_HEAL, + SLEN(SSTARTED_SELF_HEAL)); - return ret; -} + ret = afr_shd_full_healer_spawn(this, i); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + AFR_MSG_HEALER_SPAWN_FAILED, NULL); + } + op_ret = 0; + } + } + break; + case GF_SHD_OP_INDEX_SUMMARY: + /* this case has been handled in glfs-heal.c */ + break; + case GF_SHD_OP_SPLIT_BRAIN_FILES: + eh_dump(shd->split_brain, output, afr_add_shd_event); + break; + case GF_SHD_OP_STATISTICS: + for (i = 0; i < priv->child_count; i++) { + eh_dump(shd->statistics[i], output, afr_add_crawl_event); + ret = afr_shd_dict_add_crawl_event( + this, output, &shd->index_healers[i].crawl_event); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + AFR_MSG_ADD_CRAWL_EVENT_FAILED, NULL); + } + + ret = afr_shd_dict_add_crawl_event( + this, output, &shd->full_healers[i].crawl_event); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + AFR_MSG_ADD_CRAWL_EVENT_FAILED, NULL); + } + } + break; + case GF_SHD_OP_STATISTICS_HEAL_COUNT: + case GF_SHD_OP_STATISTICS_HEAL_COUNT_PER_REPLICA: + op_ret = -1; + + for (i = 0; i < priv->child_count; i++) { + if (!priv->child_up[i]) { + keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, + i); + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, + SBRICK_NOT_CONNECTED, + SLEN(SBRICK_NOT_CONNECTED)); + } else { + snprintf(key, sizeof(key), "%d-%d-hardlinks", xl_id, i); + ret = afr_shd_get_index_count(this, i, &cnt); + if (ret == 0) { + ret = dict_set_uint64(output, key, cnt); + } + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + AFR_MSG_DICT_SET_FAILED, NULL); + } + op_ret = 0; + } + } + + break; + + default: + gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "op=%d", + op, NULL); + break; + } +out: + dict_deln(output, this->name, this_name_len); + return op_ret; + +#undef AFR_SET_DICT_AND_LOG +} diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h index 32a8aaca50c..18db728ea7b 100644 --- a/xlators/cluster/afr/src/afr-self-heald.h +++ b/xlators/cluster/afr/src/afr-self-heald.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -8,45 +8,68 @@ cases as published by the Free Software Foundation. */ -#ifndef __AFR_SELF_HEALD_H__ -#define __AFR_SELF_HEALD_H__ -#include "xlator.h" +#ifndef _AFR_SELF_HEALD_H +#define _AFR_SELF_HEALD_H -#define IS_ROOT_PATH(path) (!strcmp (path, "/")) -#define IS_ENTRY_CWD(entry) (!strcmp (entry, ".")) -#define IS_ENTRY_PARENT(entry) (!strcmp (entry, "..")) -#define AFR_ALL_CHILDREN -1 +#include <pthread.h> -typedef struct afr_crawl_data_ { - int child; - pid_t pid; - afr_crawl_type_t crawl; - xlator_t *readdir_xl; - void *op_data; - int crawl_flags; - int (*process_entry) (xlator_t *this, struct afr_crawl_data_ *crawl_data, - gf_dirent_t *entry, loc_t *child, loc_t *parent, - struct iatt *iattr); -} afr_crawl_data_t; +typedef struct { + char *path; + int child; +} shd_event_t; -typedef int (*process_entry_cbk_t) (xlator_t *this, afr_crawl_data_t *crawl_data, - gf_dirent_t *entry, loc_t *child, loc_t *parent, - struct iatt *iattr); +typedef struct { + uint64_t healed_count; + uint64_t split_brain_count; + uint64_t heal_failed_count; -void afr_build_root_loc (xlator_t *this, loc_t *loc); + /* If start_time is 0, it means crawler is not in progress + and stats are not valid */ + time_t start_time; + /* If start_time is NOT 0 and end_time is 0, it means + cralwer is in progress */ + time_t end_time; + char *crawl_type; + int child; +} crawl_event_t; -int afr_set_root_gfid (dict_t *dict); +struct subvol_healer { + xlator_t *this; + crawl_event_t crawl_event; + pthread_mutex_t mutex; + pthread_cond_t cond; + pthread_t thread; + int subvol; + gf_boolean_t local; + gf_boolean_t running; + gf_boolean_t rerun; +}; -void -afr_proactive_self_heal (void *data); +typedef struct { + struct subvol_healer *index_healers; + struct subvol_healer *full_healers; + + eh_t *split_brain; + eh_t **statistics; + int timeout; + uint32_t max_threads; + uint32_t wait_qlength; + uint32_t halo_max_latency_msec; + gf_boolean_t iamshd; + gf_boolean_t enabled; +} afr_self_heald_t; int -afr_xl_op (xlator_t *this, dict_t *input, dict_t *output); +afr_selfheal_daemon_init(xlator_t *this); + +int +afr_xl_op(xlator_t *this, dict_t *input, dict_t *output); + +int +afr_shd_gfid_to_path(xlator_t *this, xlator_t *subvol, uuid_t gfid, + char **path_p); -/* - * In addition to its self-heal use, this is used to find a local default - * read_child. - */ int -afr_local_pathinfo (char *pathinfo, gf_boolean_t *local); -#endif /* __AFR_SELF_HEALD_H__ */ +afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name, + ia_type_t type); +#endif /* !_AFR_SELF_HEALD_H */ diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 0b91f466e11..a51f79b1f43 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -8,1956 +8,2920 @@ cases as published by the Free Software Foundation. */ -#include "dict.h" -#include "byte-order.h" -#include "common-utils.h" -#include "timer.h" +#include <glusterfs/dict.h> +#include <glusterfs/byte-order.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/timer.h> #include "afr.h" #include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-messages.h" #include <signal.h> +typedef enum { + AFR_TRANSACTION_PRE_OP, + AFR_TRANSACTION_POST_OP, +} afr_xattrop_type_t; -#define LOCKED_NO 0x0 /* no lock held */ -#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path - of RENAME */ -#define LOCKED_LOWER 0x2 /* for lower_path of RENAME */ +static void +afr_lock_resume_shared(struct list_head *list); -afr_fd_ctx_t * -__afr_fd_ctx_get (fd_t *fd, xlator_t *this) -{ - uint64_t ctx = 0; - int ret = 0; - afr_fd_ctx_t *fd_ctx = NULL; - int i = 0; - afr_private_t *priv = NULL; +static void +afr_post_op_handle_success(call_frame_t *frame, xlator_t *this); - priv = this->private; +static void +afr_post_op_handle_failure(call_frame_t *frame, xlator_t *this, int op_errno); - ret = __fd_ctx_get (fd, this, &ctx); +void +__afr_transaction_wake_shared(afr_local_t *local, struct list_head *shared); - if (ret < 0 && fd_is_anonymous (fd)) { - ret = __afr_fd_ctx_set (this, fd); - if (ret < 0) - goto out; +void +afr_changelog_post_op_do(call_frame_t *frame, xlator_t *this); - ret = __fd_ctx_get (fd, this, &ctx); - if (ret < 0) - goto out; +int +afr_changelog_post_op_safe(call_frame_t *frame, xlator_t *this); - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - for (i = 0; i < priv->child_count; i++) - fd_ctx->opened_on[i] = AFR_FD_OPENED; - } +gf_boolean_t +afr_changelog_pre_op_uninherit(call_frame_t *frame, xlator_t *this); - fd_ctx = (afr_fd_ctx_t *)(long) ctx; -out: - return fd_ctx; -} +gf_boolean_t +afr_changelog_pre_op_update(call_frame_t *frame, xlator_t *this); +int +afr_changelog_call_count(afr_transaction_type type, + unsigned char *pre_op_subvols, + unsigned char *failed_subvols, + unsigned int child_count); +int +afr_changelog_do(call_frame_t *frame, xlator_t *this, dict_t *xattr, + afr_changelog_resume_t changelog_resume, + afr_xattrop_type_t op); -afr_fd_ctx_t * -afr_fd_ctx_get (fd_t *fd, xlator_t *this) -{ - afr_fd_ctx_t *fd_ctx = NULL; +static void +afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this); - LOCK(&fd->lock); - { - fd_ctx = __afr_fd_ctx_get (fd, this); - } - UNLOCK(&fd->lock); +static int +afr_ta_post_op_do(void *opaque); - return fd_ctx; -} +static int +afr_ta_post_op_synctask(xlator_t *this, afr_local_t *local); +static int +afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this); static void -afr_save_lk_owner (call_frame_t *frame) -{ - afr_local_t * local = NULL; +afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno); - local = frame->local; - - local->saved_lk_owner = frame->root->lk_owner; +void +afr_ta_locked_priv_invalidate(afr_private_t *priv) +{ + priv->ta_bad_child_index = AFR_CHILD_UNKNOWN; + priv->release_ta_notify_dom_lock = _gf_false; + priv->ta_notify_dom_lock_offset = 0; } - static void -afr_restore_lk_owner (call_frame_t *frame) +afr_ta_process_waitq(xlator_t *this) { - afr_local_t * local = NULL; - - local = frame->local; - - frame->root->lk_owner = local->saved_lk_owner; + afr_local_t *entry = NULL; + afr_private_t *priv = this->private; + struct list_head waitq = { + 0, + }; + + INIT_LIST_HEAD(&waitq); + LOCK(&priv->lock); + list_splice_init(&priv->ta_waitq, &waitq); + UNLOCK(&priv->lock); + list_for_each_entry(entry, &waitq, ta_waitq) + { + afr_ta_decide_post_op_state(entry->transaction.frame, this); + } } -static void -__mark_all_pending (int32_t *pending[], int child_count, - afr_transaction_type type) +int +afr_ta_lock_release_done(int ret, call_frame_t *ta_frame, void *opaque) { - int i = 0; - int j = 0; + afr_ta_process_waitq(ta_frame->this); + STACK_DESTROY(ta_frame->root); + return 0; +} - for (i = 0; i < child_count; i++) { - j = afr_index_for_transaction_type (type); - pending[i][j] = hton32 (1); - } +int +afr_release_notify_lock_for_ta(void *opaque) +{ + xlator_t *this = NULL; + afr_private_t *priv = NULL; + loc_t loc = { + 0, + }; + struct gf_flock flock = { + 0, + }; + int ret = -1; + + this = (xlator_t *)opaque; + priv = this->private; + ret = afr_fill_ta_loc(this, &loc, _gf_true); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to populate loc for thin-arbiter."); + goto out; + } + flock.l_type = F_UNLCK; + flock.l_start = priv->ta_notify_dom_lock_offset; + flock.l_len = 1; + ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], + AFR_TA_DOM_NOTIFY, &loc, F_SETLK, &flock, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to unlock AFR_TA_DOM_NOTIFY lock."); + } + + LOCK(&priv->lock); + { + afr_ta_locked_priv_invalidate(priv); + } + UNLOCK(&priv->lock); +out: + loc_wipe(&loc); + return ret; } +void +afr_zero_fill_stat(afr_local_t *local) +{ + if (!local) + return; + if (local->transaction.type == AFR_DATA_TRANSACTION || + local->transaction.type == AFR_METADATA_TRANSACTION) { + gf_zero_fill_stat(&local->cont.inode_wfop.prebuf); + gf_zero_fill_stat(&local->cont.inode_wfop.postbuf); + } else if (local->transaction.type == AFR_ENTRY_TRANSACTION || + local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + gf_zero_fill_stat(&local->cont.dir_fop.buf); + gf_zero_fill_stat(&local->cont.dir_fop.preparent); + gf_zero_fill_stat(&local->cont.dir_fop.postparent); + if (local->transaction.type == AFR_ENTRY_TRANSACTION) + return; + gf_zero_fill_stat(&local->cont.dir_fop.prenewparent); + gf_zero_fill_stat(&local->cont.dir_fop.postnewparent); + } +} -static void -__mark_child_dead (int32_t *pending[], int child_count, int child, - afr_transaction_type type) +/* In case of errors afr needs to choose which xdata from lower xlators it needs + * to unwind with. The way it is done is by checking if there are + * any good subvols which failed. Give preference to errnos other than + * ENOTCONN even if the child is source */ +void +afr_pick_error_xdata(afr_local_t *local, afr_private_t *priv, inode_t *inode1, + unsigned char *readable1, inode_t *inode2, + unsigned char *readable2) { - int j = 0; + int s = -1; /*selection*/ + int i = 0; + unsigned char *readable = NULL; + + if (local->xdata_rsp) { + dict_unref(local->xdata_rsp); + local->xdata_rsp = NULL; + } + + readable = alloca0(priv->child_count * sizeof(*readable)); + if (inode2 && readable2) { /*rename fop*/ + AFR_INTERSECT(readable, readable1, readable2, priv->child_count); + } else { + memcpy(readable, readable1, sizeof(*readable) * priv->child_count); + } + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + + if (local->replies[i].op_ret >= 0) + continue; + + if (local->replies[i].op_errno == ENOTCONN) + continue; + + /*Order is important in the following condition*/ + if ((s < 0) || (!readable[s] && readable[i])) + s = i; + } + + if (s != -1 && local->replies[s].xdata) { + local->xdata_rsp = dict_ref(local->replies[s].xdata); + } else if (s == -1) { + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; - j = afr_index_for_transaction_type (type); + if (local->replies[i].op_ret >= 0) + continue; - pending[child][j] = 0; + if (!local->replies[i].xdata) + continue; + local->xdata_rsp = dict_ref(local->replies[i].xdata); + break; + } + } } +gf_boolean_t +afr_needs_changelog_update(afr_local_t *local) +{ + if (local->transaction.type == AFR_DATA_TRANSACTION) + return _gf_true; + if (!local->optimistic_change_log) + return _gf_true; + return _gf_false; +} -static void -__mark_pre_op_done_on_fd (call_frame_t *frame, xlator_t *this, int child_index) +gf_boolean_t +afr_changelog_has_quorum(afr_local_t *local, xlator_t *this) { - afr_local_t *local = NULL; - afr_fd_ctx_t *fd_ctx = NULL; + afr_private_t *priv = NULL; + int i = 0; + unsigned char *success_children = NULL; - local = frame->local; + priv = this->private; + success_children = alloca0(priv->child_count); - if (!local->fd) - return; + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.failed_subvols[i]) { + success_children[i] = 1; + } + } - fd_ctx = afr_fd_ctx_get (local->fd, this); + if (afr_has_quorum(success_children, this, NULL)) { + return _gf_true; + } - if (!fd_ctx) - goto out; + return _gf_false; +} - LOCK (&local->fd->lock); - { - if (local->transaction.type == AFR_DATA_TRANSACTION) - fd_ctx->pre_op_done[child_index]++; - } - UNLOCK (&local->fd->lock); -out: - return; +gf_boolean_t +afr_is_write_subvol_valid(call_frame_t *frame, xlator_t *this) +{ + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + uint64_t write_subvol = 0; + unsigned char *writable = NULL; + uint16_t datamap = 0; + + local = frame->local; + priv = this->private; + writable = alloca0(priv->child_count); + + write_subvol = afr_write_subvol_get(frame, this); + datamap = (write_subvol & 0x00000000ffff0000) >> 16; + for (i = 0; i < priv->child_count; i++) { + if (datamap & (1 << i)) + writable[i] = 1; + + if (writable[i] && !local->transaction.failed_subvols[i]) + return _gf_true; + } + + return _gf_false; } -static void -__mark_non_participant_children (int32_t *pending[], int child_count, - unsigned char *participants, - afr_transaction_type type) +int +afr_transaction_fop(call_frame_t *frame, xlator_t *this) { - int i = 0; - int j = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + unsigned char *failed_subvols = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + + failed_subvols = local->transaction.failed_subvols; + call_count = priv->child_count - + AFR_COUNT(failed_subvols, priv->child_count); + /* Fail if pre-op did not succeed on quorum no. of bricks. */ + if (!afr_changelog_has_quorum(local, this) || !call_count) { + local->op_ret = -1; + /* local->op_errno is already captured in changelog cbk. */ + afr_transaction_resume(frame, this); + return 0; + } + + /* Fail if at least one writeable brick isn't up.*/ + if (local->transaction.type == AFR_DATA_TRANSACTION && + !afr_is_write_subvol_valid(frame, this)) { + local->op_ret = -1; + local->op_errno = EIO; + afr_transaction_resume(frame, this); + return 0; + } - j = afr_index_for_transaction_type (type); - for (i = 0; i < child_count; i++) { - if (!participants[i]) - pending[i][j] = 0; + local->call_count = call_count; + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] && !failed_subvols[i]) { + local->transaction.wind(frame, this, i); + + if (!--call_count) + break; } -} + } + return 0; +} -void -__mark_all_success (int32_t *pending[], int child_count, - afr_transaction_type type) +int +afr_transaction_done(call_frame_t *frame, xlator_t *this) { - int i; - int j; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + gf_boolean_t unwind = _gf_false; + afr_lock_t *lock = NULL; + afr_local_t *lock_local = NULL; + + priv = this->private; + local = frame->local; - for (i = 0; i < child_count; i++) { - j = afr_index_for_transaction_type (type); - pending[i][j] = hton32 (-1); + if (priv->consistent_metadata) { + LOCK(&frame->lock); + { + unwind = (local->transaction.main_frame != NULL); + } + UNLOCK(&frame->lock); + if (unwind) /*It definitely did post-op*/ + afr_zero_fill_stat(local); + } + + if (local->transaction.do_eager_unlock) { + lock = &local->inode_ctx->lock[local->transaction.type]; + LOCK(&local->inode->lock); + { + lock->acquired = _gf_false; + lock->release = _gf_false; + list_splice_init(&lock->frozen, &lock->waiting); + if (list_empty(&lock->waiting)) + goto unlock; + lock_local = list_entry(lock->waiting.next, afr_local_t, + transaction.wait_list); + list_del_init(&lock_local->transaction.wait_list); + list_add(&lock_local->transaction.owner_list, &lock->owners); } + unlock: + UNLOCK(&local->inode->lock); + } + if (lock_local) { + afr_lock(lock_local->transaction.frame, + lock_local->transaction.frame->this); + } + local->transaction.unwind(frame, this); + + GF_ASSERT(list_empty(&local->transaction.owner_list)); + GF_ASSERT(list_empty(&local->transaction.wait_list)); + AFR_STACK_DESTROY(frame); + + return 0; } -void -_set_all_child_errno (int *child_errno, unsigned int child_count) +static void +afr_lock_fail_shared(afr_local_t *local, struct list_head *list) { - int i = 0; - - for (i = 0; i < child_count; i++) - if (child_errno[i] == 0) - child_errno[i] = ENOTCONN; + afr_local_t *each = NULL; + + while (!list_empty(list)) { + each = list_entry(list->next, afr_local_t, transaction.wait_list); + list_del_init(&each->transaction.wait_list); + each->op_ret = -1; + each->op_errno = local->op_errno; + afr_transaction_done(each->transaction.frame, + each->transaction.frame->this); + } } -void -afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) +static void +afr_handle_lock_acquire_failure(afr_local_t *local) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - fd_t *fd = NULL; + struct list_head shared; + afr_lock_t *lock = NULL; - local = frame->local; - priv = this->private; - fd = local->fd; + if (!local->transaction.eager_lock_on) + goto out; - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); + lock = &local->inode_ctx->lock[local->transaction.type]; - _set_all_child_errno (local->child_errno, priv->child_count); + INIT_LIST_HEAD(&shared); + LOCK(&local->inode->lock); + { + lock->release = _gf_true; + list_splice_init(&lock->waiting, &shared); + } + UNLOCK(&local->inode->lock); - /* Perform fops with the lk-owner from top xlator. - * Eg: lk-owner of posix-lk and flush should be same, - * flush cant clear the posix-lks without that lk-owner. - */ - afr_save_lk_owner (frame); - frame->root->lk_owner = - local->transaction.main_frame->root->lk_owner; - - - /* The wake up needs to happen independent of - what type of fop arrives here. If it was - a write, then it has already inherited the - lock and changelog. If it was not a write, - then the presumption of the optimization (of - optimizing for successive write operations) - fails. - */ - if (fd) - afr_delayed_changelog_wake_up (this, fd); - local->transaction.fop (frame, this); + afr_lock_fail_shared(local, &shared); + local->transaction.do_eager_unlock = _gf_true; +out: + local->internal_lock.lock_cbk = afr_transaction_done; + afr_unlock(local->transaction.frame, local->transaction.frame->this); } - -static int -__changelog_enabled (afr_private_t *priv, afr_transaction_type type) +call_frame_t * +afr_transaction_detach_fop_frame(call_frame_t *frame) { - int ret = 0; + afr_local_t *local = NULL; + call_frame_t *fop_frame = NULL; - switch (type) { - case AFR_DATA_TRANSACTION: - if (priv->data_change_log) - ret = 1; + local = frame->local; - break; + afr_handle_inconsistent_fop(frame, &local->op_ret, &local->op_errno); + LOCK(&frame->lock); + { + fop_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK(&frame->lock); - case AFR_METADATA_TRANSACTION: - if (priv->metadata_change_log) - ret = 1; + return fop_frame; +} - break; +static void +afr_save_lk_owner(call_frame_t *frame) +{ + afr_local_t *local = NULL; - case AFR_ENTRY_TRANSACTION: - case AFR_ENTRY_RENAME_TRANSACTION: - if (priv->entry_change_log) - ret = 1; + local = frame->local; - break; - } - - return ret; + local->saved_lk_owner = frame->root->lk_owner; } - -static int -__fop_changelog_needed (call_frame_t *frame, xlator_t *this) +static void +afr_restore_lk_owner(call_frame_t *frame) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - int op_ret = 0; - afr_transaction_type type = -1; - - priv = this->private; - local = frame->local; - type = local->transaction.type; + afr_local_t *local = NULL; - if (__changelog_enabled (priv, type)) { - switch (local->op) { + local = frame->local; - case GF_FOP_WRITE: - case GF_FOP_FTRUNCATE: - op_ret = 1; - break; + frame->root->lk_owner = local->saved_lk_owner; +} - case GF_FOP_FLUSH: - op_ret = 0; - break; +void +__mark_all_success(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i; - default: - op_ret = 1; - } - } + local = frame->local; + priv = this->private; - return op_ret; + for (i = 0; i < priv->child_count; i++) { + local->transaction.failed_subvols[i] = 0; + } } -int -afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, - int child, afr_xattrop_type_t op) +void +afr_compute_pre_op_sources(call_frame_t *frame, xlator_t *this) { - int i = 0; - int ret = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_transaction_type type = -1; + dict_t *xdata = NULL; + int **matrix = NULL; + int idx = -1; + int i = 0; + int j = 0; + + priv = this->private; + local = frame->local; + type = local->transaction.type; + idx = afr_index_for_transaction_type(type); + matrix = ALLOC_MATRIX(priv->child_count, int); + + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.changelog_xdata[i]) + continue; + xdata = local->transaction.changelog_xdata[i]; + afr_selfheal_fill_matrix(this, matrix, i, idx, xdata); + } + + memset(local->transaction.pre_op_sources, 1, priv->child_count); + + /*If lock or pre-op failed on a brick, it is not a source. */ + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.failed_subvols[i]) + local->transaction.pre_op_sources[i] = 0; + } + + /* If brick is blamed by others, it is not a source. */ + for (i = 0; i < priv->child_count; i++) + for (j = 0; j < priv->child_count; j++) + if (matrix[i][j] != 0) + local->transaction.pre_op_sources[j] = 0; +} - if (op == LOCAL_FIRST) { - ret = dict_set_static_bin (xattr, priv->pending_key[child], - pending[child], - AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); - if (ret) - goto out; - } - for (i = 0; i < priv->child_count; i++) { - if (i == child) - continue; - ret = dict_set_static_bin (xattr, priv->pending_key[i], - pending[i], - AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); - /* 3 = data+metadata+entry */ +void +afr_txn_arbitrate_fop(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int pre_op_sources_count = 0; + int i = 0; + + priv = this->private; + local = frame->local; + + afr_compute_pre_op_sources(frame, this); + pre_op_sources_count = AFR_COUNT(local->transaction.pre_op_sources, + priv->child_count); + + /* If arbiter is the only source, do not proceed. */ + if (pre_op_sources_count < 2 && + local->transaction.pre_op_sources[ARBITER_BRICK_INDEX]) { + local->op_ret = -1; + local->op_errno = ENOTCONN; + for (i = 0; i < priv->child_count; i++) + local->transaction.failed_subvols[i] = 1; + } + + afr_transaction_fop(frame, this); + + return; +} - if (ret < 0) - goto out; +int +afr_transaction_perform_fop(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int ret = 0; + int failure_count = 0; + struct list_head shared; + afr_lock_t *lock = NULL; + + local = frame->local; + priv = this->private; + + INIT_LIST_HEAD(&shared); + if (local->transaction.type == AFR_DATA_TRANSACTION && + !local->transaction.inherited) { + ret = afr_write_subvol_set(frame, this); + if (ret) { + /*act as if operation failed on all subvols*/ + local->op_ret = -1; + local->op_errno = -ret; + for (i = 0; i < priv->child_count; i++) + local->transaction.failed_subvols[i] = 1; } - if (op == LOCAL_LAST) { - ret = dict_set_static_bin (xattr, priv->pending_key[child], - pending[child], - AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); - if (ret) - goto out; + } + + if (local->pre_op_compat) + /* old mode, pre-op was done as afr_changelog_do() + just now, before OP */ + afr_changelog_pre_op_update(frame, this); + + if (!local->transaction.eager_lock_on || local->transaction.inherited) + goto fop; + failure_count = AFR_COUNT(local->transaction.failed_subvols, + priv->child_count); + if (failure_count == priv->child_count) { + afr_handle_lock_acquire_failure(local); + return 0; + } else { + lock = &local->inode_ctx->lock[local->transaction.type]; + LOCK(&local->inode->lock); + { + lock->acquired = _gf_true; + __afr_transaction_wake_shared(local, &shared); } -out: - return ret; + UNLOCK(&local->inode->lock); + } + +fop: + /* Perform fops with the lk-owner from top xlator. + * Eg: lk-owner of posix-lk and flush should be same, + * flush cant clear the posix-lks without that lk-owner. + */ + afr_save_lk_owner(frame); + frame->root->lk_owner = local->transaction.main_frame->root->lk_owner; + + if (priv->arbiter_count == 1) { + afr_txn_arbitrate_fop(frame, this); + } else { + afr_transaction_fop(frame, this); + } + + afr_lock_resume_shared(&shared); + return 0; } int -afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) +afr_set_pending_dict(afr_private_t *priv, dict_t *xattr, int **pending) { - int ret = 0; + int i = 0; + int ret = 0; - switch (type) { - case AFR_DATA_TRANSACTION: - ret = priv->child_count; - break; + for (i = 0; i < priv->child_count; i++) { + ret = dict_set_static_bin(xattr, priv->pending_key[i], pending[i], + AFR_NUM_CHANGE_LOGS * sizeof(int)); + /* 3 = data+metadata+entry */ - case AFR_METADATA_TRANSACTION: - ret = priv->child_count; - break; + if (ret) + break; + } - case AFR_ENTRY_TRANSACTION: - case AFR_ENTRY_RENAME_TRANSACTION: - ret = priv->child_count; - break; - } - - return ret; + return ret; } -/* {{{ pending */ - -int32_t -afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr, - dict_t *xdata) +static void +afr_ta_dom_lock_check_and_release(afr_ta_fop_state_t fop_state, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int call_count = -1; - - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; - - LOCK (&frame->lock); - { - call_count = --local->call_count; + afr_private_t *priv = this->private; + unsigned int inmem_count = 0; + unsigned int onwire_count = 0; + gf_boolean_t release = _gf_false; + + LOCK(&priv->lock); + { + /*Once we get notify lock release upcall notification, + if any of the fop state counters are non-zero, we will + not release the lock. + */ + onwire_count = priv->ta_on_wire_txn_count; + inmem_count = priv->ta_in_mem_txn_count; + switch (fop_state) { + case TA_GET_INFO_FROM_TA_FILE: + onwire_count = --priv->ta_on_wire_txn_count; + break; + case TA_INFO_IN_MEMORY_SUCCESS: + case TA_INFO_IN_MEMORY_FAILED: + inmem_count = --priv->ta_in_mem_txn_count; + break; + case TA_WAIT_FOR_NOTIFY_LOCK_REL: + GF_ASSERT(0); + break; + case TA_SUCCESS: + break; } - UNLOCK (&frame->lock); - - if (call_count == 0) { - if (local->transaction.resume_stub) { - call_resume (local->transaction.resume_stub); - local->transaction.resume_stub = NULL; - } + release = priv->release_ta_notify_dom_lock; + } + UNLOCK(&priv->lock); - if (afr_lock_server_count (priv, local->transaction.type) == 0) { - local->transaction.done (frame, this); - } else { - int_lock->lock_cbk = local->transaction.done; - afr_unlock (frame, this); - } - } + if (inmem_count != 0 || release == _gf_false || onwire_count != 0) + return; - return 0; + afr_ta_lock_release_synctask(this); } - -void -afr_transaction_rm_stale_children (call_frame_t *frame, xlator_t *this, - inode_t *inode, afr_transaction_type type) -{ - int i = -1; - int count = 0; - int read_child = -1; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int **pending = NULL; - int idx = 0; - int32_t *stale_children = NULL; - int32_t *fresh_children = NULL; - gf_boolean_t rm_stale_children = _gf_false; - - idx = afr_index_for_transaction_type (type); - - priv = this->private; - local = frame->local; - pending = local->pending; - - if (local->op_ret < 0) - goto out; - fresh_children = local->fresh_children; - read_child = afr_inode_get_read_ctx (this, inode, fresh_children); - if (read_child < 0) { - gf_log (this->name, GF_LOG_DEBUG, "Possible split-brain " - "for %s", uuid_utoa (inode->gfid)); - goto out; +static void +afr_ta_process_onwireq(afr_ta_fop_state_t fop_state, xlator_t *this) +{ + afr_private_t *priv = this->private; + afr_local_t *entry = NULL; + int bad_child = AFR_CHILD_UNKNOWN; + + struct list_head onwireq = { + 0, + }; + INIT_LIST_HEAD(&onwireq); + + LOCK(&priv->lock); + { + bad_child = priv->ta_bad_child_index; + if (bad_child == AFR_CHILD_UNKNOWN) { + /*The previous on-wire ta_post_op was a failure. Just dequeue + *one element to wind on-wire again. */ + entry = list_entry(priv->ta_onwireq.next, afr_local_t, ta_onwireq); + list_del_init(&entry->ta_onwireq); + } else { + /* Prepare to process all fops based on bad_child_index. */ + list_splice_init(&priv->ta_onwireq, &onwireq); } + } + UNLOCK(&priv->lock); - for (i = 0; i < priv->child_count; i++) { - if (!afr_is_child_present (fresh_children, - priv->child_count, i)) - continue; - if (pending[i][idx]) - continue; - /* child is down or op failed on it */ - if (!stale_children) - stale_children = afr_children_create (priv->child_count); - if (!stale_children) - goto out; - - rm_stale_children = _gf_true; - stale_children[count++] = i; - gf_log (this->name, GF_LOG_DEBUG, "Removing stale child " - "%d for %s", i, uuid_utoa (inode->gfid)); + if (entry) { + afr_ta_post_op_synctask(this, entry); + return; + } else { + while (!list_empty(&onwireq)) { + entry = list_entry(onwireq.next, afr_local_t, ta_onwireq); + list_del_init(&entry->ta_onwireq); + if (entry->ta_failed_subvol == bad_child) { + afr_post_op_handle_success(entry->transaction.frame, this); + } else { + afr_post_op_handle_failure(entry->transaction.frame, this, EIO); + } } + } +} - if (!rm_stale_children) - goto out; - - afr_inode_rm_stale_children (this, inode, stale_children); -out: - GF_FREE (stale_children); - return; +int +afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + int_lock = &local->internal_lock; + + if (priv->thin_arbiter_count) { + /*fop should not come here with TA_WAIT_FOR_NOTIFY_LOCK_REL state */ + afr_ta_dom_lock_check_and_release(local->fop_state, this); + } + + /* Fail the FOP if post-op did not succeed on quorum no. of bricks. */ + if (!afr_changelog_has_quorum(local, this)) { + local->op_ret = -1; + /*local->op_errno is already captured in changelog cbk*/ + } + + if (local->transaction.resume_stub) { + call_resume(local->transaction.resume_stub); + local->transaction.resume_stub = NULL; + } + + int_lock->lock_cbk = afr_transaction_done; + afr_unlock(frame, this); + + return 0; } -afr_inodelk_t* -afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom) +static void +afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno) { - afr_inodelk_t *inodelk = NULL; - int i = 0; + afr_local_t *local = frame->local; + local->op_ret = -1; + local->op_errno = op_errno; - for (i = 0; int_lock->inodelk[i].domain; i++) { - inodelk = &int_lock->inodelk[i]; - if (strcmp (dom, inodelk->domain) == 0) - return inodelk; - } - return NULL; + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_THIN_ARB, + "Failing %s for gfid %s. Fop state is:%d", gf_fop_list[local->op], + uuid_utoa(local->inode->gfid), local->fop_state); + + afr_changelog_post_op_done(frame, this); } -unsigned char* -afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock) +unsigned char * +afr_locked_nodes_get(afr_transaction_type type, afr_internal_lock_t *int_lock) { - unsigned char *locked_nodes = NULL; - afr_inodelk_t *inodelk = NULL; - switch (type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - locked_nodes = inodelk->locked_nodes; - break; - - case AFR_ENTRY_TRANSACTION: - case AFR_ENTRY_RENAME_TRANSACTION: - /*Because same set of subvols participate in all lockee - * entities*/ - locked_nodes = int_lock->lockee[0].locked_nodes; - break; - } - return locked_nodes; + /*Because same set of subvols participate in all lockee + * entities*/ + return int_lock->lockee[0].locked_nodes; } int -afr_changelog_pre_op_call_count (afr_transaction_type type, - afr_internal_lock_t *int_lock, - unsigned int child_count) +afr_changelog_call_count(afr_transaction_type type, + unsigned char *pre_op_subvols, + unsigned char *failed_subvols, + unsigned int child_count) { - int call_count = 0; - unsigned char *locked_nodes = NULL; + int i = 0; + int call_count = 0; - locked_nodes = afr_locked_nodes_get (type, int_lock); - GF_ASSERT (locked_nodes); + for (i = 0; i < child_count; i++) { + if (pre_op_subvols[i] && !failed_subvols[i]) { + call_count++; + } + } - call_count = afr_locked_children_count (locked_nodes, child_count); - if (type == AFR_ENTRY_RENAME_TRANSACTION) - call_count *= 2; + if (type == AFR_ENTRY_RENAME_TRANSACTION) + call_count *= 2; - return call_count; + return call_count; } -int -afr_changelog_post_op_call_count (afr_transaction_type type, - unsigned char *pre_op, - unsigned int child_count) +gf_boolean_t +afr_txn_nothing_failed(call_frame_t *frame, xlator_t *this) { - int call_count = 0; - - call_count = afr_pre_op_done_children_count (pre_op, child_count); - if (type == AFR_ENTRY_RENAME_TRANSACTION) - call_count *= 2; - - return call_count; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + + if (priv->thin_arbiter_count) { + /* We need to perform post-op even if 1 data brick was down + * before the txn started.*/ + if (AFR_COUNT(local->transaction.failed_subvols, priv->child_count)) + return _gf_false; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] && + local->transaction.failed_subvols[i]) + return _gf_false; + } + + return _gf_true; } void -afr_compute_txn_changelog (afr_local_t *local, afr_private_t *priv) +afr_handle_symmetric_errors(call_frame_t *frame, xlator_t *this) { - int i = 0; - int index = 0; - int32_t postop = 0; - int32_t preop = 1; - int32_t **txn_changelog = NULL; + if (afr_is_symmetric_error(frame, this)) + __mark_all_success(frame, this); +} - txn_changelog = local->transaction.txn_changelog; - index = afr_index_for_transaction_type (local->transaction.type); - for (i = 0; i < priv->child_count; i++) { - postop = ntoh32 (local->pending[i][index]); - txn_changelog[i][index] = hton32 (postop + preop); +gf_boolean_t +afr_has_quorum(unsigned char *subvols, xlator_t *this, call_frame_t *frame) +{ + unsigned int quorum_count = 0; + afr_private_t *priv = NULL; + unsigned int up_children_count = 0; + + priv = this->private; + up_children_count = AFR_COUNT(subvols, priv->child_count); + + if (afr_lookup_has_quorum(frame, up_children_count)) + return _gf_true; + + if (priv->quorum_count == AFR_QUORUM_AUTO) { + /* + * Special case for auto-quorum with an even number of nodes. + * + * A replica set with even count N can only handle the same + * number of failures as odd N-1 before losing "vanilla" + * quorum, and the probability of more simultaneous failures is + * actually higher. For example, with a 1% chance of failure + * we'd have a 0.03% chance of two simultaneous failures with + * N=3 but a 0.06% chance with N=4. However, the special case + * is necessary for N=2 because there's no real quorum in that + * case (i.e. can't normally survive *any* failures). In that + * case, we treat the first node as a tie-breaker, allowing + * quorum to be retained in some cases while still honoring the + * all-important constraint that there can not simultaneously + * be two partitioned sets of nodes each believing they have + * quorum. Of two equally sized sets, the one without that + * first node will lose. + * + * It turns out that the special case is beneficial for higher + * values of N as well. Continuing the example above, the + * probability of losing quorum with N=4 and this type of + * quorum is (very) slightly lower than with N=3 and vanilla + * quorum. The difference becomes even more pronounced with + * higher N. Therefore, even though such replica counts are + * unlikely to be seen in practice, we might as well use the + * "special" quorum then as well. + */ + if ((up_children_count * 2) == priv->child_count) { + return subvols[0]; } + } + + if (priv->quorum_count == AFR_QUORUM_AUTO) { + quorum_count = priv->child_count / 2 + 1; + } else { + quorum_count = priv->quorum_count; + } + + if (up_children_count >= quorum_count) + return _gf_true; + + return _gf_false; } -afr_xattrop_type_t -afr_get_postop_xattrop_type (int32_t **pending, int optimized, int child, - afr_transaction_type type) +static gf_boolean_t +afr_has_fop_quorum(call_frame_t *frame) { - int index = 0; - afr_xattrop_type_t op = LOCAL_LAST; + xlator_t *this = frame->this; + afr_local_t *local = frame->local; + unsigned char *locked_nodes = NULL; - index = afr_index_for_transaction_type (type); - if (optimized && !pending[child][index]) - op = LOCAL_FIRST; - return op; + locked_nodes = afr_locked_nodes_get(local->transaction.type, + &local->internal_lock); + return afr_has_quorum(locked_nodes, this, NULL); } -void -afr_set_postop_dict (afr_local_t *local, xlator_t *this, dict_t *xattr, - int optimized, int child) -{ - int32_t **txn_changelog = NULL; - int32_t **changelog = NULL; - afr_private_t *priv = NULL; - int ret = 0; - afr_xattrop_type_t op = LOCAL_LAST; - - priv = this->private; - txn_changelog = local->transaction.txn_changelog; - op = afr_get_postop_xattrop_type (local->pending, optimized, child, - local->transaction.type); - if (optimized) - changelog = txn_changelog; - else - changelog = local->pending; - ret = afr_set_pending_dict (priv, xattr, changelog, child, op); - if (ret < 0) - gf_log (this->name, GF_LOG_INFO, - "failed to set pending entry"); +static gf_boolean_t +afr_has_fop_cbk_quorum(call_frame_t *frame) +{ + afr_local_t *local = frame->local; + xlator_t *this = frame->this; + afr_private_t *priv = this->private; + unsigned char *success = alloca0(priv->child_count); + int i = 0; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) + if (!local->transaction.failed_subvols[i]) + success[i] = 1; + } + + return afr_has_quorum(success, this, NULL); } - gf_boolean_t -afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this) +afr_need_dirty_marking(call_frame_t *frame, xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int index = -1; - int i = 0; + afr_private_t *priv = this->private; + afr_local_t *local = NULL; + gf_boolean_t need_dirty = _gf_false; - local = frame->local; - priv = this->private; + local = frame->local; - index = afr_index_for_transaction_type (local->transaction.type); + if (!priv->quorum_count || !local->optimistic_change_log) + return _gf_false; - for (i = 0; i < priv->child_count; i++) { - if (local->pending[i][index] == 0) - return _gf_false; - } + if (local->transaction.type == AFR_DATA_TRANSACTION || + local->transaction.type == AFR_METADATA_TRANSACTION) + return _gf_false; - return _gf_true; + if (AFR_COUNT(local->transaction.failed_subvols, priv->child_count) == + priv->child_count) + return _gf_false; + + if (!afr_has_fop_cbk_quorum(frame)) + need_dirty = _gf_true; + + return need_dirty; } -static void -afr_dir_fop_handle_all_fop_failures (call_frame_t *frame) +void +afr_handle_quorum(call_frame_t *frame, xlator_t *this) { - xlator_t *this = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + const char *file = NULL; + uuid_t gfid = {0}; - this = frame->this; - local = frame->local; - priv = this->private; + local = frame->local; + priv = frame->this->private; - if ((local->transaction.type != AFR_ENTRY_TRANSACTION) && - (local->transaction.type != AFR_ENTRY_RENAME_TRANSACTION)) - return; + if (priv->quorum_count == 0) + return; - if (local->op_ret >= 0) - goto out; + /* If the fop already failed return right away to preserve errno */ + if (local->op_ret == -1) + return; - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); -out: + /* + * Network split may happen just after the fops are unwound, so check + * if the fop succeeded in a way it still follows quorum. If it doesn't, + * mark the fop as failure, mark the changelogs so it reflects that + * failure. + * + * Scenario: + * There are 3 mounts on 3 machines(node1, node2, node3) all writing to + * single file. Network split happened in a way that node1 can't see + * node2, node3. Node2, node3 both of them can't see node1. Now at the + * time of sending write all the bricks are up. Just after write fop is + * wound on node1, network split happens. Node1 thinks write fop failed + * on node2, node3 so marks pending changelog for those 2 extended + * attributes on node1. Node2, node3 thinks writes failed on node1 so + * they mark pending changelog for node1. When the network is stable + * again the file already is in split-brain. These checks prevent + * marking pending changelog on other subvolumes if the fop doesn't + * succeed in a way it is still following quorum. So with this fix what + * is happening is, node1 will have all pending changelog(FOOL) because + * the write succeeded only on node1 but failed on node2, node3 so + * instead of marking pending changelogs on node2, node3 it just treats + * the fop as failure and goes into DIRTY state. Where as node2, node3 + * say they are sources and have pending changelog to node1 so there is + * no split-brain with the fix. The problem is eliminated completely. + */ + + if (afr_has_fop_cbk_quorum(frame)) return; + + if (afr_need_dirty_marking(frame, this)) + goto set_response; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) + afr_transaction_fop_failed(frame, frame->this, i); + } + +set_response: + local->op_ret = -1; + local->op_errno = afr_final_errno(local, priv); + if (local->op_errno == 0) + local->op_errno = afr_quorum_errno(priv); + + if (local->fd) { + gf_uuid_copy(gfid, local->fd->inode->gfid); + file = uuid_utoa(gfid); + } else { + loc_path(&local->loc, local->loc.name); + file = local->loc.path; + } + + gf_msg(frame->this->name, GF_LOG_WARNING, local->op_errno, + AFR_MSG_QUORUM_FAIL, "%s: Failing %s as quorum is not met", file, + gf_fop_list[local->op]); + + switch (local->transaction.type) { + case AFR_ENTRY_TRANSACTION: + case AFR_ENTRY_RENAME_TRANSACTION: + afr_pick_error_xdata(local, priv, local->parent, local->readable, + local->parent2, local->readable2); + break; + default: + afr_pick_error_xdata(local, priv, local->inode, local->readable, + NULL, NULL); + break; + } } -static void -afr_data_handle_quota_errors (call_frame_t *frame, xlator_t *this) +int +afr_fill_ta_loc(xlator_t *this, loc_t *loc, gf_boolean_t is_gfid_based_fop) { - int i = 0; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - gf_boolean_t all_quota_failures = _gf_false; + afr_private_t *priv = NULL; + + priv = this->private; + loc->parent = inode_ref(priv->root_inode); + gf_uuid_copy(loc->pargfid, loc->parent->gfid); + loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX]; + if (is_gfid_based_fop && gf_uuid_is_null(priv->ta_gfid)) { + /* Except afr_ta_id_file_check() which is path based, all other gluster + * FOPS need gfid.*/ + return -EINVAL; + } + gf_uuid_copy(loc->gfid, priv->ta_gfid); + loc->inode = inode_new(loc->parent->table); + if (!loc->inode) { + loc_wipe(loc); + return -ENOMEM; + } + return 0; +} - local = frame->local; - priv = this->private; - if (local->transaction.type != AFR_DATA_TRANSACTION) - return; - /* - * Idea is to not leave the file in FOOL-FOOL scenario in case on - * all the bricks data transaction failed with EDQUOT to avoid - * increasing un-necessary load of self-heals in the system. - */ - all_quota_failures = _gf_true; - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i] && - (local->child_errno[i] != EDQUOT)) { - all_quota_failures = _gf_false; - break; - } - } - if (all_quota_failures) - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); +static int +afr_ta_post_op_done(int ret, call_frame_t *frame, void *opaque) +{ + xlator_t *this = NULL; + afr_local_t *local = NULL; + call_frame_t *txn_frame = NULL; + afr_ta_fop_state_t fop_state; + + local = (afr_local_t *)opaque; + fop_state = local->fop_state; + txn_frame = local->transaction.frame; + this = frame->this; + + if (ret == 0) { + /*Mark pending xattrs on the up data brick.*/ + afr_post_op_handle_success(txn_frame, this); + } else { + afr_post_op_handle_failure(txn_frame, this, -ret); + } + + STACK_DESTROY(frame->root); + afr_ta_process_onwireq(fop_state, this); + + return 0; } -int -afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) +int ** +afr_set_changelog_xattr(afr_private_t *priv, unsigned char *pending, + dict_t *xattr, afr_local_t *local) { - afr_private_t * priv = this->private; - afr_internal_lock_t *int_lock = NULL; - int i = 0; - int call_count = 0; + int **changelog = NULL; + int idx = 0; + int ret = 0; + int i; + + if (local->is_new_entry == _gf_true) { + changelog = afr_mark_pending_changelog(priv, pending, xattr, + local->cont.dir_fop.buf.ia_type); + } else { + idx = afr_index_for_transaction_type(local->transaction.type); + changelog = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS); + if (!changelog) { + goto out; + } + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.failed_subvols[i]) + changelog[i][idx] = hton32(1); + } + ret = afr_set_pending_dict(priv, xattr, changelog); + if (ret < 0) { + afr_matrix_cleanup(changelog, priv->child_count); + return NULL; + } + } - afr_local_t * local = NULL; - afr_fd_ctx_t *fdctx = NULL; - dict_t **xattr = NULL; - int piggyback = 0; - int nothing_failed = 1; +out: + return changelog; +} - local = frame->local; - int_lock = &local->internal_lock; +static void +afr_ta_locked_xattrop_validate(afr_private_t *priv, afr_local_t *local, + gf_boolean_t *valid) +{ + if (priv->ta_event_gen > local->ta_event_gen) { + /* We can't trust the ta's response anymore.*/ + afr_ta_locked_priv_invalidate(priv); + *valid = _gf_false; + return; + } + return; +} - __mark_non_participant_children (local->pending, priv->child_count, - local->transaction.pre_op, - local->transaction.type); +static int +afr_ta_post_op_do(void *opaque) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + xlator_t *this = NULL; + dict_t *xattr = NULL; + unsigned char *pending = NULL; + int **changelog = NULL; + int failed_subvol = -1; + int success_subvol = -1; + loc_t loc = { + 0, + }; + int i = 0; + int ret = 0; + gf_boolean_t valid = _gf_true; + + local = (afr_local_t *)opaque; + this = local->transaction.frame->this; + priv = this->private; + + ret = afr_fill_ta_loc(this, &loc, _gf_true); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to populate loc for thin-arbiter."); + goto out; + } + + xattr = dict_new(); + if (!xattr) { + ret = -ENOMEM; + goto out; + } + + pending = alloca0(priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.failed_subvols[i]) { + pending[i] = 1; + failed_subvol = i; + } else { + success_subvol = i; + } + } + + changelog = afr_set_changelog_xattr(priv, pending, xattr, local); + + if (!changelog) { + ret = -ENOMEM; + goto out; + } + + ret = afr_ta_post_op_lock(this, &loc); + if (ret) + goto out; + + ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc, + GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Post-op on thin-arbiter id file %s failed for gfid %s.", + priv->pending_key[THIN_ARBITER_BRICK_INDEX], + uuid_utoa(local->inode->gfid)); + } + LOCK(&priv->lock); + { + if (ret == 0) { + priv->ta_bad_child_index = failed_subvol; + } else if (ret == -EINVAL) { + priv->ta_bad_child_index = success_subvol; + ret = -EIO; /* TA failed the fop. Return EIO to application. */ + } - afr_data_handle_quota_errors (frame, this); - afr_dir_fop_handle_all_fop_failures (frame); + afr_ta_locked_xattrop_validate(priv, local, &valid); + } + UNLOCK(&priv->lock); + if (valid == _gf_false) { + gf_msg(this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB, + "Post-op on thin-arbiter id file %s for gfid %s invalidated due " + "to event-gen mismatch.", + priv->pending_key[THIN_ARBITER_BRICK_INDEX], + uuid_utoa(local->inode->gfid)); + ret = -EIO; + } + + afr_ta_post_op_unlock(this, &loc); +out: + if (xattr) + dict_unref(xattr); - if (local->fd) - afr_transaction_rm_stale_children (frame, this, - local->fd->inode, - local->transaction.type); + if (changelog) + afr_matrix_cleanup(changelog, priv->child_count); - xattr = alloca (priv->child_count * sizeof (*xattr)); - memset (xattr, 0, (priv->child_count * sizeof (*xattr))); - for (i = 0; i < priv->child_count; i++) { - xattr[i] = dict_new (); - } + loc_wipe(&loc); - call_count = afr_changelog_post_op_call_count (local->transaction.type, - local->transaction.pre_op, - priv->child_count); - local->call_count = call_count; + return ret; +} - if (local->fd) - fdctx = afr_fd_ctx_get (local->fd, this); +static int +afr_ta_post_op_synctask(xlator_t *this, afr_local_t *local) +{ + call_frame_t *ta_frame = NULL; + int ret = 0; + + ta_frame = afr_ta_frame_create(this); + if (!ta_frame) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to create ta_frame"); + goto err; + } + ret = synctask_new(this->ctx->env, afr_ta_post_op_do, afr_ta_post_op_done, + ta_frame, local); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to launch post-op on thin arbiter for gfid %s", + uuid_utoa(local->inode->gfid)); + STACK_DESTROY(ta_frame->root); + goto err; + } + + return ret; +err: + afr_changelog_post_op_fail(local->transaction.frame, this, ENOMEM); + return ret; +} - if (call_count == 0) { - /* no child is up */ - int_lock->lock_cbk = local->transaction.done; - afr_unlock (frame, this); - goto out; +static void +afr_ta_set_fop_state(afr_private_t *priv, afr_local_t *local, + int *on_wire_count) +{ + LOCK(&priv->lock); + { + if (priv->release_ta_notify_dom_lock == _gf_true) { + /* Put the fop in waitq until notify dom lock is released.*/ + local->fop_state = TA_WAIT_FOR_NOTIFY_LOCK_REL; + list_add_tail(&local->ta_waitq, &priv->ta_waitq); + } else if (priv->ta_bad_child_index == AFR_CHILD_UNKNOWN) { + /* Post-op on thin-arbiter to decide success/failure. */ + local->fop_state = TA_GET_INFO_FROM_TA_FILE; + *on_wire_count = ++priv->ta_on_wire_txn_count; + if (*on_wire_count > 1) { + /*Avoid sending multiple on-wire post-ops on TA*/ + list_add_tail(&local->ta_onwireq, &priv->ta_onwireq); + } + } else if (local->ta_failed_subvol == priv->ta_bad_child_index) { + /* Post-op on TA not needed as the fop failed on the in-memory bad + * brick. Just mark pending xattrs on the good data brick.*/ + local->fop_state = TA_INFO_IN_MEMORY_SUCCESS; + priv->ta_in_mem_txn_count++; + } else { + /* Post-op on TA not needed as the fop succeeded only on the + * in-memory bad data brick and not the good one. Fail the fop.*/ + local->fop_state = TA_INFO_IN_MEMORY_FAILED; + priv->ta_in_mem_txn_count++; } + } + UNLOCK(&priv->lock); +} - nothing_failed = afr_txn_nothing_failed (frame, this); +static void +afr_ta_fill_failed_subvol(afr_private_t *priv, afr_local_t *local) +{ + int i = 0; - afr_compute_txn_changelog (local , priv); + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.failed_subvols[i]) { + local->ta_failed_subvol = i; + break; + } + } +} - for (i = 0; i < priv->child_count; i++) { - if (!local->transaction.pre_op[i]) - continue; +static void +afr_post_op_handle_success(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; - if (local->transaction.type != AFR_DATA_TRANSACTION) - afr_set_postop_dict (local, this, xattr[i], - local->optimistic_change_log, i); - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - { - if (!fdctx) { - afr_set_postop_dict (local, this, xattr[i], - 0, i); - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - break; - } - - /* local->transaction.postop_piggybacked[] was - precomputed in is_piggyback_postop() when called from - afr_changelog_post_op_safe() - */ - - piggyback = 0; - if (local->transaction.postop_piggybacked[i]) - piggyback = 1; - - afr_set_postop_dict (local, this, xattr[i], - piggyback, i); - - if (nothing_failed && piggyback) { - afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], NULL); - } else { - STACK_WIND_COOKIE (frame, - afr_changelog_post_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - } - break; - case AFR_METADATA_TRANSACTION: - { - if (nothing_failed && local->optimistic_change_log) { - afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - break; - } - - if (local->fd) - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - else - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - break; + local = frame->local; + if (local->is_new_entry == _gf_true) { + afr_mark_new_entry_changelog(frame, this); + } + afr_changelog_post_op_do(frame, this); - case AFR_ENTRY_RENAME_TRANSACTION: - { - if (nothing_failed && local->optimistic_change_log) { - afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - } else { - STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.new_parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - call_count--; - } + return; +} - /* - set it again because previous stack_wind - might have already returned (think of case - where subvolume is posix) and would have - used the dict as placeholder for return - value - */ +static void +afr_post_op_handle_failure(call_frame_t *frame, xlator_t *this, int op_errno) +{ + afr_changelog_post_op_fail(frame, this, op_errno); - afr_set_postop_dict (local, this, xattr[i], - local->optimistic_change_log, i); + return; +} - /* fall through */ +static void +afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int on_wire_count = 0; + + priv = this->private; + local = frame->local; + + afr_ta_set_fop_state(priv, local, &on_wire_count); + + switch (local->fop_state) { + case TA_GET_INFO_FROM_TA_FILE: + if (on_wire_count == 1) + afr_ta_post_op_synctask(this, local); + /*else, fop is queued in ta_onwireq.*/ + break; + case TA_WAIT_FOR_NOTIFY_LOCK_REL: + /*Post releasing the notify lock, we will act on this queue*/ + break; + case TA_INFO_IN_MEMORY_SUCCESS: + afr_post_op_handle_success(frame, this); + break; + case TA_INFO_IN_MEMORY_FAILED: + afr_post_op_handle_failure(frame, this, EIO); + break; + default: + break; + } + return; +} - case AFR_ENTRY_TRANSACTION: - { - if (nothing_failed && local->optimistic_change_log) { - afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - break; - } - - if (local->fd) - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - else - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - break; - } +static void +afr_handle_failure_using_thin_arbiter(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = this->private; + afr_local_t *local = frame->local; + + afr_ta_fill_failed_subvol(priv, local); + gf_msg_debug(this->name, 0, + "Fop failed on data brick (%s) for gfid=%s. " + "ta info needed to decide fop result.", + priv->children[local->ta_failed_subvol]->name, + uuid_utoa(local->inode->gfid)); + afr_ta_decide_post_op_state(frame, this); +} - if (!--call_count) - break; +void +afr_changelog_post_op_do(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = this->private; + afr_local_t *local = NULL; + dict_t *xattr = NULL; + int i = 0; + int ret = 0; + int idx = 0; + int nothing_failed = 1; + gf_boolean_t need_undirty = _gf_false; + + afr_handle_quorum(frame, this); + local = frame->local; + idx = afr_index_for_transaction_type(local->transaction.type); + + xattr = dict_new(); + if (!xattr) { + afr_changelog_post_op_fail(frame, this, ENOMEM); + goto out; + } + + nothing_failed = afr_txn_nothing_failed(frame, this); + + if (afr_changelog_pre_op_uninherit(frame, this)) + need_undirty = _gf_false; + else + need_undirty = _gf_true; + + if (local->op_ret < 0 && !nothing_failed) { + if (afr_need_dirty_marking(frame, this)) { + local->dirty[idx] = hton32(1); + goto set_dirty; } + afr_changelog_post_op_done(frame, this); + goto out; + } + + if (nothing_failed && !need_undirty) { + afr_changelog_post_op_done(frame, this); + goto out; + } + + if (local->transaction.in_flight_sb) { + afr_changelog_post_op_fail(frame, this, + local->transaction.in_flight_sb_errno); + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.failed_subvols[i]) + local->pending[i][idx] = hton32(1); + } + + ret = afr_set_pending_dict(priv, xattr, local->pending); + if (ret < 0) { + afr_changelog_post_op_fail(frame, this, ENOMEM); + goto out; + } + + if (need_undirty) + local->dirty[idx] = hton32(-1); + else + local->dirty[idx] = hton32(0); + +set_dirty: + ret = dict_set_static_bin(xattr, AFR_DIRTY, local->dirty, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + afr_changelog_post_op_fail(frame, this, ENOMEM); + goto out; + } + + afr_changelog_do(frame, this, xattr, afr_changelog_post_op_done, + AFR_TRANSACTION_POST_OP); out: - for (i = 0; i < priv->child_count; i++) { - dict_unref (xattr[i]); - } + if (xattr) + dict_unref(xattr); - return 0; + return; } - -int32_t -afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr, - dict_t *xdata) +static int +afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = this->private; - int call_count = -1; - int child_index = (long) cookie; - - local = frame->local; - - LOCK (&frame->lock); - { - switch (op_ret) { - case 0: - __mark_pre_op_done_on_fd (frame, this, child_index); - //fallthrough we need to mark the pre_op - case 1: - local->transaction.pre_op[child_index] = 1; - /* special op_ret for piggyback */ - break; - case -1: - if (op_errno == ENOTSUP) { - gf_log (this->name, GF_LOG_ERROR, - "xattrop not supported by %s", - priv->children[child_index]->name); - local->op_ret = -1; - - } else if (!child_went_down (op_ret, op_errno)) { - gf_log (this->name, GF_LOG_ERROR, - "xattrop failed on child %s: %s", - priv->children[child_index]->name, - strerror (op_errno)); - } - local->op_errno = op_errno; - break; - } - - call_count = --local->call_count; - } - UNLOCK (&frame->lock); - - if (call_count == 0) { - if ((local->op_ret == -1) && - (local->op_errno == ENOTSUP)) { - local->transaction.resume (frame, this); - } else { - afr_transaction_perform_fop (frame, this); - } + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int failed_count = 0; + + priv = this->private; + local = frame->local; + + if (priv->thin_arbiter_count) { + failed_count = AFR_COUNT(local->transaction.failed_subvols, + priv->child_count); + if (failed_count == 1) { + afr_handle_failure_using_thin_arbiter(frame, this); + return 0; + } else { + /* Txn either succeeded or failed on both data bricks. Let + * post_op_do handle it as the case might be. */ } + } - return 0; + afr_changelog_post_op_do(frame, this); + return 0; } -int -afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) +gf_boolean_t +afr_changelog_pre_op_uninherit(call_frame_t *frame, xlator_t *this) { - afr_private_t * priv = this->private; - int i = 0; - int ret = 0; - int call_count = 0; - dict_t **xattr = NULL; - afr_fd_ctx_t *fdctx = NULL; - afr_local_t *local = NULL; - int piggyback = 0; - afr_internal_lock_t *int_lock = NULL; - unsigned char *locked_nodes = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_inode_ctx_t *ctx = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + int type = 0; + + local = frame->local; + priv = this->private; + ctx = local->inode_ctx; + + type = afr_index_for_transaction_type(local->transaction.type); + if (type != AFR_DATA_TRANSACTION) + return !local->transaction.dirtied; + + if (local->transaction.no_uninherit) + return _gf_false; - local = frame->local; - int_lock = &local->internal_lock; + /* This function must be idempotent. So check if we + were called before and return the same answer again. - xattr = alloca (priv->child_count * sizeof (*xattr)); - memset (xattr, 0, (priv->child_count * sizeof (*xattr))); + It is important to keep this function idempotent for + the call in afr_changelog_post_op_safe() to not have + side effects on the call from afr_changelog_post_op_now() + */ + if (local->transaction.uninherit_done) + return local->transaction.uninherit_value; + LOCK(&local->inode->lock); + { for (i = 0; i < priv->child_count; i++) { - xattr[i] = dict_new (); + if (local->transaction.pre_op[i] != ctx->pre_op_done[type][i]) { + ret = !local->transaction.dirtied; + goto unlock; + } } - call_count = afr_changelog_pre_op_call_count (local->transaction.type, - int_lock, - priv->child_count); - if (call_count == 0) { - local->internal_lock.lock_cbk = - local->transaction.done; - afr_unlock (frame, this); - goto out; + if (ctx->inherited[type]) { + ret = _gf_true; + ctx->inherited[type]--; + } else if (ctx->on_disk[type]) { + ret = _gf_false; + ctx->on_disk[type]--; + } else { + /* ASSERT */ + ret = _gf_false; } - local->call_count = call_count; - - __mark_all_pending (local->pending, priv->child_count, - local->transaction.type); - - if (local->fd) - fdctx = afr_fd_ctx_get (local->fd, this); - - locked_nodes = afr_locked_nodes_get (local->transaction.type, int_lock); - for (i = 0; i < priv->child_count; i++) { - if (!locked_nodes[i]) - continue; - ret = afr_set_pending_dict (priv, xattr[i], local->pending, - i, LOCAL_FIRST); - - if (ret < 0) - gf_log (this->name, GF_LOG_INFO, - "failed to set pending entry"); - - - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - { - if (!fdctx) { - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &(local->loc), - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - break; - } - - LOCK (&local->fd->lock); - { - piggyback = 0; - if (fdctx->pre_op_done[i]) { - fdctx->pre_op_piggyback[i]++; - piggyback = 1; - fdctx->hit++; - } else { - fdctx->miss++; - } - } - UNLOCK (&local->fd->lock); - - afr_set_delayed_post_op (frame, this); - - if (piggyback) - afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - else - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - break; - case AFR_METADATA_TRANSACTION: - { - if (local->optimistic_change_log) { - afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - break; - } - - if (local->fd) - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - else - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &(local->loc), - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - break; - - case AFR_ENTRY_RENAME_TRANSACTION: - { - if (local->optimistic_change_log) { - afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - } else { - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.new_parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - - call_count--; - } + if (!ctx->inherited[type] && !ctx->on_disk[type]) { + for (i = 0; i < priv->child_count; i++) + ctx->pre_op_done[type][i] = 0; + } + } +unlock: + UNLOCK(&local->inode->lock); + local->transaction.uninherit_done = _gf_true; + local->transaction.uninherit_value = ret; - /* - set it again because previous stack_wind - might have already returned (think of case - where subvolume is posix) and would have - used the dict as placeholder for return - value - */ + return ret; +} - ret = afr_set_pending_dict (priv, xattr[i], local->pending, - i, LOCAL_FIRST); +gf_boolean_t +afr_changelog_pre_op_inherit(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + int type = 0; - if (ret < 0) - gf_log (this->name, GF_LOG_INFO, - "failed to set pending entry"); + local = frame->local; + priv = this->private; - /* fall through */ + if (local->transaction.type != AFR_DATA_TRANSACTION) + return _gf_false; - case AFR_ENTRY_TRANSACTION: - { - if (local->optimistic_change_log) { - afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - break; - } - - if (local->fd) - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - else - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - break; - } + type = afr_index_for_transaction_type(local->transaction.type); - if (!--call_count) - break; + LOCK(&local->inode->lock); + { + if (!local->inode_ctx->on_disk[type]) { + /* nothing to inherit yet */ + ret = _gf_false; + goto unlock; } -out: + for (i = 0; i < priv->child_count; i++) { - dict_unref (xattr[i]); + if (local->transaction.pre_op[i] != + local->inode_ctx->pre_op_done[type][i]) { + /* either inherit exactly, or don't */ + ret = _gf_false; + goto unlock; + } } - return 0; -} + local->inode_ctx->inherited[type]++; + ret = _gf_true; -int -afr_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "Blocking inodelks failed."); - local->transaction.done (frame, this); - } else { - - gf_log (this->name, GF_LOG_DEBUG, - "Blocking inodelks done. Proceeding to FOP"); - afr_internal_lock_finish (frame, this); - } + local->transaction.inherited = _gf_true; + } +unlock: + UNLOCK(&local->inode->lock); - return 0; + return ret; } - -int -afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) +gf_boolean_t +afr_changelog_pre_op_update(call_frame_t *frame, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + int type = 0; - local = frame->local; - int_lock = &local->internal_lock; + local = frame->local; + priv = this->private; - /* Initiate blocking locks if non-blocking has failed */ - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Non blocking inodelks failed. Proceeding to blocking"); - int_lock->lock_cbk = afr_post_blocking_inodelk_cbk; - afr_blocking_lock (frame, this); - } else { + if (local->transaction.type == AFR_ENTRY_TRANSACTION || + local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) + return _gf_false; - gf_log (this->name, GF_LOG_DEBUG, - "Non blocking inodelks done. Proceeding to FOP"); - afr_internal_lock_finish (frame, this); - } + if (local->transaction.inherited) + /* was already inherited in afr_changelog_pre_op */ + return _gf_false; - return 0; -} + if (!local->transaction.dirtied) + return _gf_false; + if (!afr_txn_nothing_failed(frame, this)) + return _gf_false; -int -afr_post_blocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; + type = afr_index_for_transaction_type(local->transaction.type); - local = frame->local; - int_lock = &local->internal_lock; + ret = _gf_false; - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "Blocking entrylks failed."); - local->transaction.done (frame, this); + LOCK(&local->inode->lock); + { + if (!local->inode_ctx->on_disk[type]) { + for (i = 0; i < priv->child_count; i++) + local->inode_ctx->pre_op_done[type][i] = + (!local->transaction.failed_subvols[i]); } else { - - gf_log (this->name, GF_LOG_DEBUG, - "Blocking entrylks done. Proceeding to FOP"); - afr_internal_lock_finish (frame, this); + for (i = 0; i < priv->child_count; i++) + if (local->inode_ctx->pre_op_done[type][i] != + (!local->transaction.failed_subvols[i])) { + local->transaction.no_uninherit = 1; + goto unlock; + } } + local->inode_ctx->on_disk[type]++; - return 0; -} + ret = _gf_true; + } +unlock: + UNLOCK(&local->inode->lock); + return ret; +} int -afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) +afr_changelog_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, dict_t *xattr, dict_t *xdata) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; + afr_local_t *local = NULL; + int call_count = -1; + int child_index = -1; - local = frame->local; - int_lock = &local->internal_lock; + local = frame->local; + child_index = (long)cookie; - /* Initiate blocking locks if non-blocking has failed */ - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Non blocking entrylks failed. Proceeding to blocking"); - int_lock->lock_cbk = afr_post_blocking_entrylk_cbk; - afr_blocking_lock (frame, this); - } else { + if (op_ret == -1) { + local->op_errno = op_errno; + afr_transaction_fop_failed(frame, this, child_index); + } - gf_log (this->name, GF_LOG_DEBUG, - "Non blocking entrylks done. Proceeding to FOP"); - afr_internal_lock_finish (frame, this); - } + if (xattr) + local->transaction.changelog_xdata[child_index] = dict_ref(xattr); - return 0; -} + call_count = afr_frame_return(frame); + if (call_count == 0) { + local->transaction.changelog_resume(frame, this); + } -int -afr_post_blocking_rename_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; + return 0; +} - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "Blocking entrylks failed."); - local->transaction.done (frame, this); - } else { +void +afr_changelog_populate_xdata(call_frame_t *frame, afr_xattrop_type_t op, + dict_t **xdata, dict_t **newloc_xdata) +{ + int i = 0; + int ret = 0; + char *key = NULL; + int keylen = 0; + const char *name = NULL; + dict_t *xdata1 = NULL; + dict_t *xdata2 = NULL; + xlator_t *this = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + gf_boolean_t need_entry_key_set = _gf_true; + + local = frame->local; + this = THIS; + priv = this->private; + + if (local->transaction.type == AFR_DATA_TRANSACTION || + local->transaction.type == AFR_METADATA_TRANSACTION) + goto out; + + if (!priv->esh_granular) + goto out; + + xdata1 = dict_new(); + if (!xdata1) + goto out; + + name = local->loc.name; + if (local->op == GF_FOP_LINK) + name = local->newloc.name; + + switch (op) { + case AFR_TRANSACTION_PRE_OP: + key = GF_XATTROP_ENTRY_IN_KEY; + break; + case AFR_TRANSACTION_POST_OP: + if (afr_txn_nothing_failed(frame, this)) { + key = GF_XATTROP_ENTRY_OUT_KEY; + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.failed_subvols[i]) + continue; + need_entry_key_set = _gf_false; + break; + } + /* If the transaction itself did not fail and there + * are no failed subvolumes, check whether the fop + * failed due to a symmetric error. If it did, do + * not set the ENTRY_OUT xattr which would end up + * deleting a name index which was created possibly by + * an earlier entry txn that may have failed on some + * of the sub-volumes. + */ + if (local->op_ret) + need_entry_key_set = _gf_false; + } else { + key = GF_XATTROP_ENTRY_IN_KEY; + } + break; + } + + if (need_entry_key_set) { + keylen = strlen(key); + ret = dict_set_strn(xdata1, key, keylen, (char *)name); + if (ret) + gf_msg(THIS->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED, + "%s/%s: Could not set %s key during xattrop", + uuid_utoa(local->loc.pargfid), local->loc.name, key); + if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + xdata2 = dict_new(); + if (!xdata2) + goto out; - gf_log (this->name, GF_LOG_DEBUG, - "Blocking entrylks done. Proceeding to FOP"); - afr_internal_lock_finish (frame, this); + ret = dict_set_strn(xdata2, key, keylen, + (char *)local->newloc.name); + if (ret) + gf_msg(THIS->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED, + "%s/%s: Could not set %s key during " + "xattrop", + uuid_utoa(local->newloc.pargfid), local->newloc.name, + key); } - return 0; -} + } + *xdata = xdata1; + *newloc_xdata = xdata2; + xdata1 = xdata2 = NULL; +out: + if (xdata1) + dict_unref(xdata1); + return; +} int -afr_post_lower_unlock_cbk (call_frame_t *frame, xlator_t *this) +afr_changelog_prepare(xlator_t *this, call_frame_t *frame, int *call_count, + afr_changelog_resume_t changelog_resume, + afr_xattrop_type_t op, dict_t **xdata, + dict_t **newloc_xdata) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; - local = frame->local; - int_lock = &local->internal_lock; + local = frame->local; + priv = this->private; - GF_ASSERT (!int_lock->higher_locked); + *call_count = afr_changelog_call_count( + local->transaction.type, local->transaction.pre_op, + local->transaction.failed_subvols, priv->child_count); - int_lock->lock_cbk = afr_post_blocking_rename_cbk; - afr_blocking_lock (frame, this); + if (*call_count == 0) { + changelog_resume(frame, this); + return -1; + } - return 0; -} + afr_changelog_populate_xdata(frame, op, xdata, newloc_xdata); + local->call_count = *call_count; + local->transaction.changelog_resume = changelog_resume; + return 0; +} int -afr_set_transaction_flock (afr_local_t *local) +afr_changelog_do(call_frame_t *frame, xlator_t *this, dict_t *xattr, + afr_changelog_resume_t changelog_resume, afr_xattrop_type_t op) { - afr_internal_lock_t *int_lock = NULL; - afr_inodelk_t *inodelk = NULL; - - int_lock = &local->internal_lock; - inodelk = afr_get_inodelk (int_lock, int_lock->domain); + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + dict_t *newloc_xdata = NULL; + int i = 0; + int call_count = 0; + int ret = 0; + + local = frame->local; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.changelog_xdata[i]) { + dict_unref(local->transaction.changelog_xdata[i]); + local->transaction.changelog_xdata[i] = NULL; + } + } - inodelk->flock.l_len = local->transaction.len; - inodelk->flock.l_start = local->transaction.start; - inodelk->flock.l_type = F_WRLCK; + ret = afr_changelog_prepare(this, frame, &call_count, changelog_resume, op, + &xdata, &newloc_xdata); + if (ret) return 0; -} - -int -afr_lock_rec (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - int_lock->transaction_lk_type = AFR_TRANSACTION_LK; - int_lock->domain = this->name; + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i] || + local->transaction.failed_subvols[i]) + continue; switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - afr_set_transaction_flock (local); - - int_lock->lock_cbk = afr_post_nonblocking_inodelk_cbk; - - afr_nonblocking_inodelk (frame, this); + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + if (!local->fd) { + STACK_WIND_COOKIE( + frame, afr_changelog_cbk, (void *)(long)i, + priv->children[i], priv->children[i]->fops->xattrop, + &local->loc, GF_XATTROP_ADD_ARRAY, xattr, xdata); + } else { + STACK_WIND_COOKIE( + frame, afr_changelog_cbk, (void *)(long)i, + priv->children[i], priv->children[i]->fops->fxattrop, + local->fd, GF_XATTROP_ADD_ARRAY, xattr, xdata); + } break; + case AFR_ENTRY_RENAME_TRANSACTION: - case AFR_ENTRY_RENAME_TRANSACTION: + STACK_WIND_COOKIE(frame, afr_changelog_cbk, (void *)(long)i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.new_parent_loc, + GF_XATTROP_ADD_ARRAY, xattr, newloc_xdata); + call_count--; - int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk; - afr_nonblocking_entrylk (frame, this); - break; + /* fall through */ - case AFR_ENTRY_TRANSACTION: - int_lock->lk_basename = local->transaction.basename; - if (&local->transaction.parent_loc) - int_lock->lk_loc = &local->transaction.parent_loc; + case AFR_ENTRY_TRANSACTION: + if (local->fd) + STACK_WIND_COOKIE( + frame, afr_changelog_cbk, (void *)(long)i, + priv->children[i], priv->children[i]->fops->fxattrop, + local->fd, GF_XATTROP_ADD_ARRAY, xattr, xdata); else - GF_ASSERT (local->fd); - - int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk; - afr_nonblocking_entrylk (frame, this); + STACK_WIND_COOKIE(frame, afr_changelog_cbk, (void *)(long)i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.parent_loc, + GF_XATTROP_ADD_ARRAY, xattr, xdata); break; } - return 0; -} + if (!--call_count) + break; + } + if (xdata) + dict_unref(xdata); + if (newloc_xdata) + dict_unref(newloc_xdata); + return 0; +} -int -afr_lock (call_frame_t *frame, xlator_t *this) +static void +afr_init_optimistic_changelog_for_txn(xlator_t *this, afr_local_t *local) { - afr_set_lock_number (frame, this); + int locked_count = 0; + afr_private_t *priv = NULL; - return afr_lock_rec (frame, this); -} + priv = this->private; + locked_count = AFR_COUNT(local->transaction.pre_op, priv->child_count); + if (priv->optimistic_change_log && locked_count == priv->child_count) + local->optimistic_change_log = 1; -/* }}} */ + return; +} int -afr_internal_lock_finish (call_frame_t *frame, xlator_t *this) +afr_changelog_pre_op(call_frame_t *frame, xlator_t *this) { - if (__fop_changelog_needed (frame, this)) { - afr_changelog_pre_op (frame, this); + afr_private_t *priv = this->private; + int i = 0; + int ret = 0; + int call_count = 0; + int op_errno = 0; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + unsigned char *locked_nodes = NULL; + int idx = -1; + gf_boolean_t pre_nop = _gf_true; + dict_t *xdata_req = NULL; + + local = frame->local; + int_lock = &local->internal_lock; + idx = afr_index_for_transaction_type(local->transaction.type); + + locked_nodes = afr_locked_nodes_get(local->transaction.type, int_lock); + + for (i = 0; i < priv->child_count; i++) { + if (locked_nodes[i]) { + local->transaction.pre_op[i] = 1; + call_count++; } else { - afr_transaction_perform_fop (frame, this); + local->transaction.failed_subvols[i] = 1; + } + } + + afr_init_optimistic_changelog_for_txn(this, local); + + if (afr_changelog_pre_op_inherit(frame, this)) + goto next; + + /* This condition should not be met with present code, as + * transaction.done will be called if locks are not acquired on even a + * single node. + */ + if (call_count == 0) { + op_errno = ENOTCONN; + goto err; + } + + /* Check if the fop can be performed on at least + * quorum number of nodes. + */ + if (priv->quorum_count && !afr_has_fop_quorum(frame)) { + op_errno = int_lock->lock_op_errno; + if (op_errno == 0) + op_errno = afr_quorum_errno(priv); + goto err; + } + + xdata_req = dict_new(); + if (!xdata_req) { + op_errno = ENOMEM; + goto err; + } + + if (call_count < priv->child_count) + pre_nop = _gf_false; + + /* Set an all-zero pending changelog so that in the cbk, we can get the + * current on-disk values. In a replica 3 volume with arbiter enabled, + * these values are needed to arrive at a go/ no-go of the fop phase to + * avoid ending up in split-brain.*/ + + ret = afr_set_pending_dict(priv, xdata_req, local->pending); + if (ret < 0) { + op_errno = ENOMEM; + goto err; + } + + if (afr_needs_changelog_update(local)) { + local->dirty[idx] = hton32(1); + + ret = dict_set_static_bin(xdata_req, AFR_DIRTY, local->dirty, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + op_errno = ENOMEM; + goto err; } - return 0; -} + pre_nop = _gf_false; + local->transaction.dirtied = 1; + } + if (pre_nop) + goto next; -void -afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + if (!local->pre_op_compat) { + dict_copy(xdata_req, local->xdata_req); + goto next; + } - /* call this function from any of the related optimizations - which benefit from delaying post op are enabled, namely: + afr_changelog_do(frame, this, xdata_req, afr_transaction_perform_fop, + AFR_TRANSACTION_PRE_OP); - - changelog piggybacking - - eager locking - */ + if (xdata_req) + dict_unref(xdata_req); - priv = this->private; - if (!priv) - return; + return 0; +next: + afr_transaction_perform_fop(frame, this); - if (!priv->post_op_delay_secs) - return; + if (xdata_req) + dict_unref(xdata_req); - local = frame->local; - if (!local->transaction.eager_lock_on) - return; + return 0; +err: + local->internal_lock.lock_cbk = afr_transaction_done; + local->op_ret = -1; + local->op_errno = op_errno; - if (!local) - return; + afr_handle_lock_acquire_failure(local); - if (!local->fd) - return; + if (xdata_req) + dict_unref(xdata_req); - if (local->op == GF_FOP_WRITE) - local->delayed_post_op = _gf_true; + return 0; } -gf_boolean_t -afr_are_multiple_fds_opened (inode_t *inode, xlator_t *this) +int +afr_post_nonblocking_lock_cbk(call_frame_t *frame, xlator_t *this) { - afr_inode_ctx_t *ictx = NULL; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + + local = frame->local; + int_lock = &local->internal_lock; + + /* Initiate blocking locks if non-blocking has failed */ + if (int_lock->lock_op_ret < 0) { + gf_msg_debug(this->name, 0, + "Non blocking locks failed. Proceeding to blocking"); + int_lock->lock_cbk = afr_internal_lock_finish; + afr_blocking_lock(frame, this); + } else { + gf_msg_debug(this->name, 0, + "Non blocking locks done. Proceeding to FOP"); + + afr_internal_lock_finish(frame, this); + } + + return 0; +} - if (!inode) { - /* If false is returned, it may keep on taking eager-lock - * which may lead to starvation, so return true to avoid that. - */ - gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid inode"); - return _gf_true; - } - /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock - * is taken mount2 opened the same file, it won't be able to - * perform any data operations until mount1 releases eager-lock. - * To avoid such scenario do not enable eager-lock for this transaction - * if open-fd-count is > 1 - */ +int +afr_post_blocking_rename_cbk(call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; - ictx = afr_inode_ctx_get (inode, this); - if (!ictx) - return _gf_true; + local = frame->local; + int_lock = &local->internal_lock; - if (ictx->open_fd_count > 1) - return _gf_true; + if (int_lock->lock_op_ret < 0) { + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_INTERNAL_LKS_FAILED, + "Blocking entrylks failed."); - return _gf_false; + afr_transaction_done(frame, this); + } else { + gf_msg_debug(this->name, 0, + "Blocking entrylks done. Proceeding to FOP"); + + afr_internal_lock_finish(frame, this); + } + return 0; } -gf_boolean_t -afr_any_fops_failed (afr_local_t *local, afr_private_t *priv) +int +afr_post_lower_unlock_cbk(call_frame_t *frame, xlator_t *this) { - if (local->success_count != priv->child_count) - return _gf_true; - return _gf_false; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + + local = frame->local; + int_lock = &local->internal_lock; + + GF_ASSERT(!int_lock->higher_locked); + + int_lock->lock_cbk = afr_post_blocking_rename_cbk; + afr_blocking_lock(frame, this); + + return 0; } -gf_boolean_t -is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this) +int +afr_set_transaction_flock(xlator_t *this, afr_local_t *local, + afr_lockee_t *lockee) { - afr_local_t *local = NULL; - gf_boolean_t res = _gf_false; - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; + struct gf_flock *flock = NULL; + + priv = this->private; + flock = &lockee->flock; + + if ((priv->arbiter_count || local->transaction.eager_lock_on || + priv->full_lock) && + local->transaction.type == AFR_DATA_TRANSACTION) { + /*Lock entire file to avoid network split brains.*/ + flock->l_len = 0; + flock->l_start = 0; + } else { + flock->l_len = local->transaction.len; + flock->l_start = local->transaction.start; + } + flock->l_type = F_WRLCK; + + return 0; +} - priv = this->private; +int +afr_lock(call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + int i = 0; - local = frame->local; - if (!local) - goto out; + local = frame->local; + int_lock = &local->internal_lock; - if (!local->delayed_post_op) - goto out; + int_lock->lock_cbk = afr_post_nonblocking_lock_cbk; + int_lock->domain = this->name; - //Mark pending changelog ASAP - if (afr_any_fops_failed (local, priv)) - goto out; + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + for (i = 0; i < int_lock->lockee_count; i++) { + afr_set_transaction_flock(this, local, &int_lock->lockee[i]); + } - if (local->fd && afr_are_multiple_fds_opened (local->fd->inode, this)) - goto out; + break; - res = _gf_true; -out: - return res; -} + case AFR_ENTRY_TRANSACTION: + int_lock->lk_basename = local->transaction.basename; + if (local->transaction.parent_loc.path) + int_lock->lk_loc = &local->transaction.parent_loc; + else + GF_ASSERT(local->fd); + break; + case AFR_ENTRY_RENAME_TRANSACTION: + break; + } + afr_lock_nonblocking(frame, this); + return 0; +} -void -afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, - call_stub_t *stub); +static gf_boolean_t +afr_locals_overlap(afr_local_t *local1, afr_local_t *local2) +{ + uint64_t start1 = local1->transaction.start; + uint64_t start2 = local2->transaction.start; + uint64_t end1 = 0; + uint64_t end2 = 0; + + if (local1->transaction.len) + end1 = start1 + local1->transaction.len - 1; + else + end1 = ULLONG_MAX; + + if (local2->transaction.len) + end2 = start2 + local2->transaction.len - 1; + else + end2 = ULLONG_MAX; + + return ((end1 >= start2) && (end2 >= start1)); +} -void -afr_delayed_changelog_wake_up_cbk (void *data) +gf_boolean_t +afr_has_lock_conflict(afr_local_t *local, gf_boolean_t waitlist_check) { - fd_t *fd = NULL; + afr_local_t *each = NULL; + afr_lock_t *lock = NULL; + + lock = &local->inode_ctx->lock[local->transaction.type]; + /* + * Once full file lock is acquired in eager-lock phase, overlapping + * writes do not compete for inode-locks, instead are transferred to the + * next writes. Because of this overlapping writes are not ordered. + * This can cause inconsistencies in replication. + * Example: + * Two overlapping writes w1, w2 are sent in parallel on same fd + * in two threads t1, t2. + * Both threads can execute afr_writev_wind in the following manner. + * t1 winds w1 on brick-0 + * t2 winds w2 on brick-0 + * t2 winds w2 on brick-1 + * t1 winds w1 on brick-1 + * + * This check makes sure the locks are not transferred for + * overlapping writes. + */ + list_for_each_entry(each, &lock->owners, transaction.owner_list) + { + if (afr_locals_overlap(each, local)) { + return _gf_true; + } + } - fd = data; + if (!waitlist_check) + return _gf_false; + list_for_each_entry(each, &lock->waiting, transaction.wait_list) + { + if (afr_locals_overlap(each, local)) { + return _gf_true; + } + } + return _gf_false; +} - afr_delayed_changelog_wake_up (THIS, fd); +/* }}} */ +static void +afr_copy_inodelk_vars(afr_internal_lock_t *dst, afr_internal_lock_t *src, + xlator_t *this, int lockee_num) +{ + afr_private_t *priv = this->private; + afr_lockee_t *sl = &src->lockee[lockee_num]; + afr_lockee_t *dl = &dst->lockee[lockee_num]; + + dst->domain = src->domain; + dl->flock.l_len = sl->flock.l_len; + dl->flock.l_start = sl->flock.l_start; + dl->flock.l_type = sl->flock.l_type; + dl->locked_count = sl->locked_count; + memcpy(dl->locked_nodes, sl->locked_nodes, + priv->child_count * sizeof(*dl->locked_nodes)); } +void +__afr_transaction_wake_shared(afr_local_t *local, struct list_head *shared) +{ + gf_boolean_t conflict = _gf_false; + afr_local_t *each = NULL; + afr_lock_t *lock = &local->inode_ctx->lock[local->transaction.type]; + + while (!conflict) { + if (list_empty(&lock->waiting)) + return; + each = list_entry(lock->waiting.next, afr_local_t, + transaction.wait_list); + if (afr_has_lock_conflict(each, _gf_false)) { + conflict = _gf_true; + } + if (conflict && !list_empty(&lock->owners)) + return; + afr_copy_inodelk_vars(&each->internal_lock, &local->internal_lock, + each->transaction.frame->this, 0); + list_move_tail(&each->transaction.wait_list, shared); + list_add_tail(&each->transaction.owner_list, &lock->owners); + } +} -/* - Check if the frame is destined to get optimized away - with changelog piggybacking -*/ -static gf_boolean_t -is_piggyback_post_op (call_frame_t *frame, fd_t *fd) +static void +afr_lock_resume_shared(struct list_head *list) { - afr_fd_ctx_t *fdctx = NULL; - afr_local_t *local = NULL; - gf_boolean_t piggyback = _gf_true; - afr_private_t *priv = NULL; - int i = 0; + afr_local_t *each = NULL; + + while (!list_empty(list)) { + each = list_entry(list->next, afr_local_t, transaction.wait_list); + list_del_init(&each->transaction.wait_list); + afr_changelog_pre_op(each->transaction.frame, + each->transaction.frame->this); + } +} - priv = frame->this->private; - local = frame->local; - fdctx = afr_fd_ctx_get (fd, frame->this); +int +afr_internal_lock_finish(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = frame->local; + afr_lock_t *lock = NULL; + + local->internal_lock.lock_cbk = NULL; + if (!local->transaction.eager_lock_on) { + if (local->internal_lock.lock_op_ret < 0) { + afr_transaction_done(frame, this); + return 0; + } + afr_changelog_pre_op(frame, this); + } else { + lock = &local->inode_ctx->lock[local->transaction.type]; + if (local->internal_lock.lock_op_ret < 0) { + afr_handle_lock_acquire_failure(local); + } else { + lock->event_generation = local->event_generation; + afr_changelog_pre_op(frame, this); + } + } - LOCK(&fd->lock); - { - piggyback = _gf_true; + return 0; +} - for (i = 0; i < priv->child_count; i++) { - if (!local->transaction.pre_op[i]) - continue; - if (fdctx->pre_op_piggyback[i]) { - fdctx->pre_op_piggyback[i]--; - local->transaction.postop_piggybacked[i] = 1; - } else { - /* For at least _one_ subvolume we cannot - piggyback on the changelog, and have to - perform a hard POST-OP and therefore fsync - if necesssary - */ - piggyback = _gf_false; - GF_ASSERT (fdctx->pre_op_done[i]); - fdctx->pre_op_done[i]--; - } - } +gf_boolean_t +afr_are_conflicting_ops_waiting(afr_local_t *local, xlator_t *this) +{ + afr_lock_t *lock = NULL; + lock = &local->inode_ctx->lock[local->transaction.type]; + + /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock + * is taken mount2 opened the same file, it won't be able to + * perform any {meta,}data operations until mount1 releases eager-lock. + * To avoid such scenario do not enable eager-lock for this transaction + * if open-fd-count is > 1 for metadata transactions and if num-inodelks > 1 + * for data transactions + */ + + if (local->transaction.type == AFR_METADATA_TRANSACTION) { + if (local->inode_ctx->open_fd_count > 1) { + return _gf_true; } - UNLOCK(&fd->lock); - - if (!afr_txn_nothing_failed (frame, frame->this)) { - /* something failed in this transaction, - we will be performing a hard post-op - */ - return _gf_false; + } else if (local->transaction.type == AFR_DATA_TRANSACTION) { + if (lock->num_inodelks > 1) { + return _gf_true; } + } - return piggyback; + return _gf_false; } +gf_boolean_t +afr_is_delayed_changelog_post_op_needed(call_frame_t *frame, xlator_t *this, + int delay) +{ + afr_local_t *local = NULL; + afr_lock_t *lock = NULL; + gf_boolean_t res = _gf_false; + + local = frame->local; + lock = &local->inode_ctx->lock[local->transaction.type]; + + if (!afr_txn_nothing_failed(frame, this)) { + lock->release = _gf_true; + goto out; + } + + if (afr_are_conflicting_ops_waiting(local, this)) { + lock->release = _gf_true; + goto out; + } + + if (!list_empty(&lock->owners)) + goto out; + else + GF_ASSERT(list_empty(&lock->waiting)); + + if (lock->release) { + goto out; + } + + if (!delay) { + goto out; + } + + if (local->transaction.disable_delayed_post_op) { + goto out; + } + + if ((local->op != GF_FOP_WRITE) && (local->op != GF_FOP_FXATTROP) && + (local->op != GF_FOP_FSYNC)) { + /*Only allow writes/fsyncs but shard does [f]xattrops on writes, so + * they are fine too*/ + goto out; + } + + res = _gf_true; +out: + return res; +} + +void +afr_delayed_changelog_wake_up_cbk(void *data) +{ + afr_lock_t *lock = NULL; + afr_local_t *local = data; + afr_local_t *timer_local = NULL; + struct list_head shared; + + INIT_LIST_HEAD(&shared); + lock = &local->inode_ctx->lock[local->transaction.type]; + LOCK(&local->inode->lock); + { + timer_local = list_entry(lock->post_op.next, afr_local_t, + transaction.owner_list); + if (list_empty(&lock->owners) && (local == timer_local)) { + GF_ASSERT(list_empty(&lock->waiting)); + /*Last owner*/ + lock->release = _gf_true; + lock->delay_timer = NULL; + } + } + UNLOCK(&local->inode->lock); + afr_changelog_post_op_now(local->transaction.frame, + local->transaction.frame->this); +} /* SET operation */ int -afr_fd_report_unstable_write (xlator_t *this, fd_t *fd) +afr_fd_report_unstable_write(xlator_t *this, afr_local_t *local) { - afr_fd_ctx_t *fdctx = NULL; - - fdctx = afr_fd_ctx_get (fd, this); + LOCK(&local->inode->lock); + { + local->inode_ctx->witnessed_unstable_write = _gf_true; + } + UNLOCK(&local->inode->lock); - LOCK(&fd->lock); - { - fdctx->witnessed_unstable_write = _gf_true; - } - UNLOCK(&fd->lock); - - return 0; + return 0; } /* TEST and CLEAR operation */ gf_boolean_t -afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd) +afr_fd_has_witnessed_unstable_write(xlator_t *this, inode_t *inode) { - afr_fd_ctx_t *fdctx = NULL; - gf_boolean_t witness = _gf_false; + afr_inode_ctx_t *ctx = NULL; + gf_boolean_t witness = _gf_false; - fdctx = afr_fd_ctx_get (fd, this); - if (!fdctx) - return _gf_true; + LOCK(&inode->lock); + { + (void)__afr_inode_ctx_get(this, inode, &ctx); - LOCK(&fd->lock); - { - if (fdctx->witnessed_unstable_write) { - witness = _gf_true; - fdctx->witnessed_unstable_write = _gf_false; - } + if (ctx->witnessed_unstable_write) { + witness = _gf_true; + ctx->witnessed_unstable_write = _gf_false; } - UNLOCK (&fd->lock); + } + UNLOCK(&inode->lock); - return witness; + return witness; } - int -afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *pre, - struct iatt *post, dict_t *xdata) -{ - afr_private_t *priv = NULL; - int child_index = (long) cookie; - int call_count = -1; - afr_local_t *local = NULL; - - priv = this->private; - local = frame->local; - - if (afr_fop_failed (op_ret, op_errno)) { - /* Failure of fsync() is as good as failure of previous - write(). So treat it like one. - */ - gf_log (this->name, GF_LOG_WARNING, - "fsync(%s) failed on subvolume %s. Transaction was %s", - uuid_utoa (local->fd->inode->gfid), - priv->children[child_index]->name, - gf_fop_list[local->op]); - - afr_transaction_fop_failed (frame, this, child_index); - } +afr_changelog_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + afr_private_t *priv = NULL; + int child_index = (long)cookie; + int call_count = -1; + afr_local_t *local = NULL; - call_count = afr_frame_return (frame); + priv = this->private; + local = frame->local; - if (call_count == 0) - afr_changelog_post_op_now (frame, this); + if (op_ret != 0) { + /* Failure of fsync() is as good as failure of previous + write(). So treat it like one. + */ + gf_msg(this->name, GF_LOG_WARNING, op_errno, AFR_MSG_FSYNC_FAILED, + "fsync(%s) failed on subvolume %s. Transaction was %s", + uuid_utoa(local->fd->inode->gfid), + priv->children[child_index]->name, gf_fop_list[local->op]); - return 0; -} + afr_transaction_fop_failed(frame, this, child_index); + } + + call_count = afr_frame_return(frame); + if (call_count == 0) + afr_changelog_post_op_now(frame, this); + + return 0; +} int -afr_changelog_fsync (call_frame_t *frame, xlator_t *this) +afr_changelog_fsync(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - int i = 0; - int call_count = 0; - afr_private_t *priv = NULL; - dict_t *xdata = NULL; - GF_UNUSED int ret = -1; + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + GF_UNUSED int ret = -1; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); + call_count = AFR_COUNT(local->transaction.pre_op, priv->child_count); - if (!call_count) { - /* will go straight to unlock */ - afr_changelog_post_op_now (frame, this); - return 0; - } + if (!call_count) { + /* will go straight to unlock */ + afr_changelog_post_op_now(frame, this); + return 0; + } - local->call_count = call_count; + local->call_count = call_count; - xdata = dict_new(); - if (xdata) - ret = dict_set_int32 (xdata, "batch-fsync", 1); + xdata = dict_new(); + if (xdata) { + ret = dict_set_int32_sizen(xdata, "batch-fsync", 1); + ret = dict_set_str(xdata, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); + } - for (i = 0; i < priv->child_count; i++) { - if (!local->transaction.pre_op[i]) - continue; + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i]) + continue; - STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk, - (void *) (long) i, priv->children[i], - priv->children[i]->fops->fsync, local->fd, - 1, xdata); - if (!--call_count) - break; - } + STACK_WIND_COOKIE(frame, afr_changelog_fsync_cbk, (void *)(long)i, + priv->children[i], priv->children[i]->fops->fsync, + local->fd, 1, xdata); + if (!--call_count) + break; + } - if (xdata) - dict_unref (xdata); + if (xdata) + dict_unref(xdata); - return 0; + return 0; } - - int -afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this) +int +afr_changelog_post_op_safe(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; - - if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) { - afr_changelog_post_op_now (frame, this); - return 0; - } + local = frame->local; + priv = this->private; - if (is_piggyback_post_op (frame, local->fd)) { - /* just detected that this post-op is about to - be optimized away as a new write() has - already piggybacked on this frame's changelog. - */ - afr_changelog_post_op_now (frame, this); - return 0; - } + if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) { + afr_changelog_post_op_now(frame, this); + return 0; + } - /* Calling afr_changelog_post_op_now() now will result in - issuing ->[f]xattrop(). - - Performing a hard POST-OP (->[f]xattrop() FOP) is a more - responsible operation that what it might appear on the surface. - - The changelog of a file (in the xattr of the file on the server) - stores information (pending count) about the state of the file - on the OTHER server. This changelog is blindly trusted, and must - therefore be updated in such a way it remains trustworthy. This - implies that decrementing the pending count (essentially "clearing - the dirty flag") must be done STRICTLY after we are sure that the - operation on the other server has reached stable storage. - - While the backend filesystem on that server will eventually flush - it to stable storage, we (being in userspace) have no mechanism - to get notified when the write became "stable". - - This means we need take matter into our own hands and issue an - fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES, - and get an acknowledgement for it. And we need to wait for the - fsync() acknowledgement before initiating the hard POST-OP. - - However if the FD itself was opened in O_SYNC or O_DSYNC then - we are already guaranteed that the writes were made stable as - part of the FOP itself. The same holds true for NFS stable - writes which happen on an anonymous FD with O_DSYNC or O_SYNC - flag set in the writev() @flags param. For all other write types, - mark a flag in the fdctx whenever an unstable write is witnessed. + if (afr_changelog_pre_op_uninherit(frame, this) && + afr_txn_nothing_failed(frame, this)) { + /* just detected that this post-op is about to + be optimized away as a new write() has + already piggybacked on this frame's changelog. */ - - if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) { - afr_changelog_post_op_now (frame, this); - return 0; - } - - /* Check whether users want durability and perform fsync/post-op - * accordingly. - */ - if (priv->ensure_durability) { - /* Time to fsync() */ - afr_changelog_fsync (frame, this); - } else { - afr_changelog_post_op_now (frame, this); - } - + afr_changelog_post_op_now(frame, this); + return 0; + } + + /* Calling afr_changelog_post_op_now() now will result in + issuing ->[f]xattrop(). + + Performing a hard POST-OP (->[f]xattrop() FOP) is a more + responsible operation that what it might appear on the surface. + + The changelog of a file (in the xattr of the file on the server) + stores information (pending count) about the state of the file + on the OTHER server. This changelog is blindly trusted, and must + therefore be updated in such a way it remains trustworthy. This + implies that decrementing the pending count (essentially "clearing + the dirty flag") must be done STRICTLY after we are sure that the + operation on the other server has reached stable storage. + + While the backend filesystem on that server will eventually flush + it to stable storage, we (being in userspace) have no mechanism + to get notified when the write became "stable". + + This means we need take matter into our own hands and issue an + fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES, + and get an acknowledgement for it. And we need to wait for the + fsync() acknowledgement before initiating the hard POST-OP. + + However if the FD itself was opened in O_SYNC or O_DSYNC then + we are already guaranteed that the writes were made stable as + part of the FOP itself. The same holds true for NFS stable + writes which happen on an anonymous FD with O_DSYNC or O_SYNC + flag set in the writev() @flags param. For all other write types, + mark a flag in the fdctx whenever an unstable write is witnessed. + */ + + if (!afr_fd_has_witnessed_unstable_write(this, local->inode)) { + afr_changelog_post_op_now(frame, this); return 0; + } + + /* Check whether users want durability and perform fsync/post-op + * accordingly. + */ + if (priv->ensure_durability) { + /* Time to fsync() */ + afr_changelog_fsync(frame, this); + } else { + afr_changelog_post_op_now(frame, this); + } + + return 0; } - void -afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, - call_stub_t *stub) +afr_changelog_post_op(call_frame_t *frame, xlator_t *this) { - afr_fd_ctx_t *fd_ctx = NULL; - call_frame_t *prev_frame = NULL; - struct timeval delta = {0, }; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - priv = this->private; - - fd_ctx = afr_fd_ctx_get (fd, this); - if (!fd_ctx) - goto out; + struct timespec delta = { + 0, + }; + afr_private_t *priv = NULL; + afr_local_t *local = frame->local; + afr_lock_t *lock = NULL; + gf_boolean_t post_op = _gf_true; + struct list_head shared; + + priv = this->private; + delta.tv_sec = priv->post_op_delay_secs; + delta.tv_nsec = 0; + + INIT_LIST_HEAD(&shared); + if (!local->transaction.eager_lock_on) + goto out; + + lock = &local->inode_ctx->lock[local->transaction.type]; + LOCK(&local->inode->lock); + { + list_del_init(&local->transaction.owner_list); + list_add(&local->transaction.owner_list, &lock->post_op); + __afr_transaction_wake_shared(local, &shared); + + if (!afr_is_delayed_changelog_post_op_needed(frame, this, + delta.tv_sec)) { + if (list_empty(&lock->owners)) + lock->release = _gf_true; + goto unlock; + } - delta.tv_sec = priv->post_op_delay_secs; - delta.tv_usec = 0; - - pthread_mutex_lock (&fd_ctx->delay_lock); - { - prev_frame = fd_ctx->delay_frame; - fd_ctx->delay_frame = NULL; - if (fd_ctx->delay_timer) - gf_timer_call_cancel (this->ctx, fd_ctx->delay_timer); - fd_ctx->delay_timer = NULL; - if (!frame) - goto unlock; - fd_ctx->delay_timer = gf_timer_call_after (this->ctx, delta, - afr_delayed_changelog_wake_up_cbk, - fd); - fd_ctx->delay_frame = frame; - } + GF_ASSERT(lock->delay_timer == NULL); + lock->delay_timer = gf_timer_call_after( + this->ctx, delta, afr_delayed_changelog_wake_up_cbk, local); + if (!lock->delay_timer) { + lock->release = _gf_true; + } else { + post_op = _gf_false; + } + } unlock: - pthread_mutex_unlock (&fd_ctx->delay_lock); + UNLOCK(&local->inode->lock); + + if (!list_empty(&shared)) { + afr_lock_resume_shared(&shared); + } out: - if (prev_frame) { - local = prev_frame->local; - local->transaction.resume_stub = stub; - afr_changelog_post_op_safe (prev_frame, this); - } else if (stub) { - call_resume (stub); - } + if (post_op) { + if (!local->transaction.eager_lock_on || lock->release) { + afr_changelog_post_op_safe(frame, this); + } else { + afr_changelog_post_op_now(frame, this); + } + } } - -void -afr_changelog_post_op (call_frame_t *frame, xlator_t *this) +int +afr_transaction_resume(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (is_afr_delayed_changelog_post_op_needed (frame, this)) - afr_delayed_changelog_post_op (this, frame, local->fd, NULL); - else - afr_changelog_post_op_safe (frame, this); -} + afr_restore_lk_owner(frame); + afr_handle_symmetric_errors(frame, this); + if (!local->pre_op_compat) + /* new mode, pre-op was done along + with OP */ + afr_changelog_pre_op_update(frame, this); -/* Wake up the sleeping/delayed post-op, and also register - a stub to have it resumed after this transaction - completely finishes. + afr_changelog_post_op(frame, this); - The @stub gets saved in @local and gets resumed in - afr_local_cleanup() - */ -void -afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub) -{ - afr_delayed_changelog_post_op (this, NULL, fd, stub); + return 0; } +/** + * afr_transaction_fop_failed - inform that an fop failed + */ void -afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd) +afr_transaction_fop_failed(call_frame_t *frame, xlator_t *this, int child_index) { - afr_delayed_changelog_post_op (this, NULL, fd, NULL); -} + afr_local_t *local = NULL; + + local = frame->local; + local->transaction.failed_subvols[child_index] = 1; +} - int -afr_transaction_resume (call_frame_t *frame, xlator_t *this) +static gf_boolean_t +__need_previous_lock_unlocked(afr_local_t *local) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_lock_t *lock = NULL; - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; + lock = &local->inode_ctx->lock[local->transaction.type]; + if (!lock->acquired) + return _gf_false; + if (lock->acquired && lock->event_generation != local->event_generation) + return _gf_true; + return _gf_false; +} - if (local->transaction.eager_lock_on) { - /* We don't need to retain "local" in the - fd list anymore, writes to all subvols - are finished by now */ - LOCK (&local->fd->lock); - { - list_del_init (&local->transaction.eager_locked); - } - UNLOCK (&local->fd->lock); +void +__afr_eager_lock_handle(afr_local_t *local, gf_boolean_t *take_lock, + gf_boolean_t *do_pre_op, afr_local_t **timer_local) +{ + afr_lock_t *lock = NULL; + afr_local_t *owner_local = NULL; + xlator_t *this = local->transaction.frame->this; + + local->transaction.eager_lock_on = _gf_true; + afr_set_lk_owner(local->transaction.frame, this, local->inode); + + lock = &local->inode_ctx->lock[local->transaction.type]; + if (__need_previous_lock_unlocked(local)) { + if (!list_empty(&lock->owners)) { + lock->release = _gf_true; + } else if (lock->delay_timer) { + lock->release = _gf_true; + if (gf_timer_call_cancel(this->ctx, lock->delay_timer)) { + /* It will be put in frozen list + * in the code flow below*/ + } else { + *timer_local = list_entry(lock->post_op.next, afr_local_t, + transaction.owner_list); + lock->delay_timer = NULL; + } } - - afr_restore_lk_owner (frame); - - if (__fop_changelog_needed (frame, this)) { - afr_changelog_post_op (frame, this); + } + + if (lock->release) { + list_add_tail(&local->transaction.wait_list, &lock->frozen); + *take_lock = _gf_false; + goto out; + } + + if (lock->delay_timer) { + *take_lock = _gf_false; + if (gf_timer_call_cancel(this->ctx, lock->delay_timer)) { + list_add_tail(&local->transaction.wait_list, &lock->frozen); } else { - if (afr_lock_server_count (priv, local->transaction.type) == 0) { - local->transaction.done (frame, this); - } else { - int_lock->lock_cbk = local->transaction.done; - afr_unlock (frame, this); - } + *timer_local = list_entry(lock->post_op.next, afr_local_t, + transaction.owner_list); + afr_copy_inodelk_vars(&local->internal_lock, + &(*timer_local)->internal_lock, this, 0); + lock->delay_timer = NULL; + *do_pre_op = _gf_true; + list_add_tail(&local->transaction.owner_list, &lock->owners); } + goto out; + } - return 0; + if (!list_empty(&lock->owners)) { + if (!lock->acquired || afr_has_lock_conflict(local, _gf_true)) { + list_add_tail(&local->transaction.wait_list, &lock->waiting); + *take_lock = _gf_false; + goto out; + } + owner_local = list_entry(lock->owners.next, afr_local_t, + transaction.owner_list); + afr_copy_inodelk_vars(&local->internal_lock, + &owner_local->internal_lock, this, 0); + *take_lock = _gf_false; + *do_pre_op = _gf_true; + } + + if (lock->acquired) + GF_ASSERT(!(*take_lock)); + list_add_tail(&local->transaction.owner_list, &lock->owners); +out: + return; } - -/** - * afr_transaction_fop_failed - inform that an fop failed - */ - void -afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, - int child_index) +afr_transaction_start(afr_local_t *local, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - local = frame->local; - priv = this->private; - - __mark_child_dead (local->pending, priv->child_count, - child_index, local->transaction.type); + afr_private_t *priv = NULL; + gf_boolean_t take_lock = _gf_true; + gf_boolean_t do_pre_op = _gf_false; + afr_local_t *timer_local = NULL; + + priv = this->private; + + if (local->transaction.type != AFR_DATA_TRANSACTION && + local->transaction.type != AFR_METADATA_TRANSACTION) + goto lock_phase; + + if (!priv->eager_lock) + goto lock_phase; + + LOCK(&local->inode->lock); + { + __afr_eager_lock_handle(local, &take_lock, &do_pre_op, &timer_local); + } + UNLOCK(&local->inode->lock); +lock_phase: + if (!local->transaction.eager_lock_on) { + afr_set_lk_owner(local->transaction.frame, this, + local->transaction.frame->root); + } + + if (take_lock) { + afr_lock(local->transaction.frame, this); + } else if (do_pre_op) { + afr_changelog_pre_op(local->transaction.frame, this); + } + /*Always call delayed_changelog_wake_up_cbk after calling pre-op above + * so that any inheriting can happen*/ + if (timer_local) + afr_delayed_changelog_wake_up_cbk(timer_local); } - - - static gf_boolean_t -afr_locals_overlap (afr_local_t *local1, afr_local_t *local2) +int +afr_write_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err) { - uint64_t start1 = local1->transaction.start; - uint64_t start2 = local2->transaction.start; - uint64_t end1 = 0; - uint64_t end2 = 0; - - if (local1->transaction.len) - end1 = start1 + local1->transaction.len - 1; - else - end1 = ULLONG_MAX; - - if (local2->transaction.len) - end2 = start2 + local2->transaction.len - 1; - else - end2 = ULLONG_MAX; - - return ((end1 >= start2) && (end2 >= start1)); + afr_local_t *local = frame->local; + + if (err) { + AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN(-1, err); + goto fail; + } + + afr_transaction_start(local, this); + return 0; +fail: + local->transaction.unwind(frame, this); + AFR_STACK_DESTROY(frame); + return 0; } -void -afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this) +int +afr_transaction_lockee_init(call_frame_t *frame) { - afr_private_t *priv = NULL; - afr_fd_ctx_t *fdctx = NULL; - afr_local_t *each = NULL; - - priv = this->private; + afr_local_t *local = frame->local; + afr_internal_lock_t *int_lock = &local->internal_lock; + afr_private_t *priv = frame->this->private; + int ret = 0; - if (!local->fd) - return; - - if (local->transaction.type != AFR_DATA_TRANSACTION) - return; - - if (!priv->eager_lock) - return; - - fdctx = afr_fd_ctx_get (local->fd, this); - if (!fdctx) - return; + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + ret = afr_add_inode_lockee(local, priv->child_count); + break; - if (afr_are_multiple_fds_opened (local->fd->inode, this)) - return; - /* - * Once full file lock is acquired in eager-lock phase, overlapping - * writes do not compete for inode-locks, instead are transferred to the - * next writes. Because of this overlapping writes are not ordered. - * This can cause inconsistencies in replication. - * Example: - * Two overlapping writes w1, w2 are sent in parallel on same fd - * in two threads t1, t2. - * Both threads can execute afr_writev_wind in the following manner. - * t1 winds w1 on brick-0 - * t2 winds w2 on brick-0 - * t2 winds w2 on brick-1 - * t1 winds w1 on brick-1 - * - * This check makes sure the locks are not transferred for - * overlapping writes. - */ - LOCK (&local->fd->lock); - { - list_for_each_entry (each, &fdctx->eager_locked, - transaction.eager_locked) { - if (afr_locals_overlap (each, local)) { - local->transaction.eager_lock_on = _gf_false; - goto unlock; - } + case AFR_ENTRY_TRANSACTION: + case AFR_ENTRY_RENAME_TRANSACTION: + ret = afr_add_entry_lockee(local, &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) { + goto out; + } + if (local->op == GF_FOP_RENAME) { + ret = afr_add_entry_lockee( + local, &local->transaction.new_parent_loc, + local->transaction.new_basename, priv->child_count); + if (ret) { + goto out; } - local->transaction.eager_lock_on = _gf_true; - list_add_tail (&local->transaction.eager_locked, - &fdctx->eager_locked); - } -unlock: - UNLOCK (&local->fd->lock); + if (local->newloc.inode && + IA_ISDIR(local->newloc.inode->ia_type)) { + ret = afr_add_entry_lockee(local, &local->newloc, NULL, + priv->child_count); + if (ret) { + goto out; + } + } + } else if (local->op == GF_FOP_RMDIR) { + ret = afr_add_entry_lockee(local, &local->loc, NULL, + priv->child_count); + if (ret) { + goto out; + } + } + + if (int_lock->lockee_count > 1) { + qsort(int_lock->lockee, int_lock->lockee_count, + sizeof(*int_lock->lockee), afr_entry_lockee_cmp); + } + break; + } +out: + return ret; } - int -afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) +afr_transaction(call_frame_t *frame, xlator_t *this, afr_transaction_type type) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - fd_t *fd = NULL; - int ret = -1; - - local = frame->local; - priv = this->private; - - local->transaction.resume = afr_transaction_resume; - local->transaction.type = type; - - ret = afr_transaction_local_init (local, this); - if (ret < 0) - goto out; - - afr_transaction_eager_lock_init (local, this); - - if (local->fd && local->transaction.eager_lock_on) - afr_set_lk_owner (frame, this, local->fd); - else - afr_set_lk_owner (frame, this, frame->root); - - if (!local->transaction.eager_lock_on && local->loc.inode) { - fd = fd_lookup (local->loc.inode, frame->root->pid); - if (fd == NULL) - fd = fd_lookup_anonymous (local->loc.inode); - - if (fd) { - afr_delayed_changelog_wake_up (this, fd); - fd_unref (fd); - } - } - - if (afr_lock_server_count (priv, local->transaction.type) == 0) { - afr_internal_lock_finish (frame, this); - } else { - afr_lock (frame, this); - } + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int ret = -1; + int event_generation = 0; + + local = frame->local; + priv = this->private; + local->transaction.frame = frame; + + local->transaction.type = type; + + if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) { + ret = -afr_quorum_errno(priv); + goto out; + } + + if (!afr_is_consistent_io_possible(local, priv, &ret)) { + ret = -ret; /*op_errno to ret conversion*/ + goto out; + } + + if (priv->thin_arbiter_count && !afr_ta_has_quorum(priv, local)) { + ret = -afr_quorum_errno(priv); + goto out; + } + + ret = afr_transaction_local_init(local, this); + if (ret < 0) + goto out; + + ret = afr_transaction_lockee_init(frame); + if (ret) + goto out; + + if (type != AFR_METADATA_TRANSACTION) { + goto txn_start; + } + + ret = afr_inode_get_readable(frame, local->inode, this, local->readable, + &event_generation, type); + if (ret < 0 || + afr_is_inode_refresh_reqd(local->inode, this, priv->event_generation, + event_generation)) { + afr_inode_refresh(frame, this, local->inode, local->loc.gfid, + afr_write_txn_refresh_done); ret = 0; + goto out; + } + +txn_start: + ret = 0; + afr_transaction_start(local, this); out: - return ret; + return ret; } diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index fa626fd0d6e..beefa26f4a6 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -11,41 +11,65 @@ #ifndef __TRANSACTION_H__ #define __TRANSACTION_H__ -typedef enum { - LOCAL_FIRST = 1, - LOCAL_LAST = 2 -} afr_xattrop_type_t; +#include "afr.h" void -afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, - int child_index); +afr_transaction_fop_failed(call_frame_t *frame, xlator_t *this, + int child_index); + +int32_t +afr_transaction(call_frame_t *frame, xlator_t *this, afr_transaction_type type); int -afr_lock_server_count (afr_private_t *priv, afr_transaction_type type); +afr_set_pending_dict(afr_private_t *priv, dict_t *xattr, int32_t **pending); + +void +afr_delayed_changelog_wake_up(xlator_t *this, fd_t *fd); -afr_inodelk_t* -afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom); +void +__mark_all_success(call_frame_t *frame, xlator_t *this); -int32_t -afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type); +gf_boolean_t +afr_txn_nothing_failed(call_frame_t *frame, xlator_t *this); -afr_fd_ctx_t * -afr_fd_ctx_get (fd_t *fd, xlator_t *this); int -afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, - int child, afr_xattrop_type_t op); -void -afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this); +afr_read_txn(call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_read_txn_wind_t readfn, afr_transaction_type type); + +int +afr_read_txn_continue(call_frame_t *frame, xlator_t *this, int subvol); void -afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd); +afr_pending_read_increment(afr_private_t *priv, int child_index); void -__mark_all_success (int32_t *pending[], int child_count, - afr_transaction_type type); -gf_boolean_t -afr_any_fops_failed (afr_local_t *local, afr_private_t *priv); +afr_pending_read_decrement(afr_private_t *priv, int child_index); +call_frame_t * +afr_transaction_detach_fop_frame(call_frame_t *frame); +gf_boolean_t +afr_has_quorum(unsigned char *subvols, xlator_t *this, call_frame_t *frame); gf_boolean_t -afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this); +afr_needs_changelog_update(afr_local_t *local); +void +afr_zero_fill_stat(afr_local_t *local); + +void +afr_pick_error_xdata(afr_local_t *local, afr_private_t *priv, inode_t *inode1, + unsigned char *readable1, inode_t *inode2, + unsigned char *readable2); +int +afr_transaction_resume(call_frame_t *frame, xlator_t *this); + +int +afr_lock(call_frame_t *frame, xlator_t *this); + +void +afr_delayed_changelog_wake_up_cbk(void *data); + +int +afr_release_notify_lock_for_ta(void *opaque); + +int +afr_ta_lock_release_done(int ret, call_frame_t *ta_frame, void *opaque); #endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 4338b799aa7..df7366f0a65 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -15,772 +15,1330 @@ #include <stdlib.h> #include <signal.h> -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif #include "afr-common.c" - -#define SHD_INODE_LRU_LIMIT 2048 -#define AFR_EH_HEALED_LIMIT 1024 -#define AFR_EH_HEAL_FAIL_LIMIT 1024 -#define AFR_EH_SPLIT_BRAIN_LIMIT 1024 +#include "afr-messages.h" struct volume_options options[]; +static char *afr_favorite_child_policies[AFR_FAV_CHILD_POLICY_MAX + 1] = { + [AFR_FAV_CHILD_NONE] = "none", + [AFR_FAV_CHILD_BY_SIZE] = "size", + [AFR_FAV_CHILD_BY_CTIME] = "ctime", + [AFR_FAV_CHILD_BY_MTIME] = "mtime", + [AFR_FAV_CHILD_BY_MAJORITY] = "majority", + [AFR_FAV_CHILD_POLICY_MAX] = NULL, +}; + int32_t -notify (xlator_t *this, int32_t event, - void *data, ...) +notify(xlator_t *this, int32_t event, void *data, ...) { - int ret = -1; - va_list ap; - void *data2 = NULL; + int ret = -1; + va_list ap; + void *data2 = NULL; - va_start (ap, data); - data2 = va_arg (ap, dict_t*); - va_end (ap); - ret = afr_notify (this, event, data, data2); + va_start(ap, data); + data2 = va_arg(ap, dict_t *); + va_end(ap); + ret = afr_notify(this, event, data, data2); - return ret; + return ret; } int32_t -mem_acct_init (xlator_t *this) +mem_acct_init(xlator_t *this) { - int ret = -1; + int ret = -1; - if (!this) - return ret; - - ret = xlator_mem_acct_init (this, gf_afr_mt_end + 1); + if (!this) + return ret; - if (ret != 0) { - gf_log(this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } + ret = xlator_mem_acct_init(this, gf_afr_mt_end + 1); + if (ret != 0) { return ret; -} + } + return ret; +} int -xlator_subvolume_index (xlator_t *this, xlator_t *subvol) +xlator_subvolume_index(xlator_t *this, xlator_t *subvol) { - int index = -1; - int i = 0; - xlator_list_t *list = NULL; - - list = this->children; - - while (list) { - if (subvol == list->xlator || - strcmp (subvol->name, list->xlator->name) == 0) { - index = i; - break; - } - list = list->next; - i++; + int index = -1; + int i = 0; + xlator_list_t *list = NULL; + + list = this->children; + + while (list) { + if (subvol == list->xlator || + strcmp(subvol->name, list->xlator->name) == 0) { + index = i; + break; } + list = list->next; + i++; + } - return index; + return index; } -void -fix_quorum_options (xlator_t *this, afr_private_t *priv, char *qtype) +static void +fix_quorum_options(xlator_t *this, afr_private_t *priv, char *qtype, + dict_t *options) { - if (priv->quorum_count && strcmp(qtype,"fixed")) { - gf_log(this->name,GF_LOG_WARNING, - "quorum-type %s overriding quorum-count %u", - qtype, priv->quorum_count); - } - if (!strcmp(qtype,"none")) { - priv->quorum_count = 0; - } - else if (!strcmp(qtype,"auto")) { - priv->quorum_count = AFR_QUORUM_AUTO; - } + if (dict_get_sizen(options, "quorum-type") == NULL) { + /* If user doesn't configure anything enable auto-quorum if the + * replica has more than two subvolumes */ + if (priv->child_count > 2) + qtype = "auto"; + } + + if (priv->quorum_count && strcmp(qtype, "fixed")) { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_OVERRIDE, + "quorum-type %s overriding quorum-count %u", qtype, + priv->quorum_count); + } + + if (!strcmp(qtype, "none")) { + priv->quorum_count = 0; + } else if (!strcmp(qtype, "auto")) { + priv->quorum_count = AFR_QUORUM_AUTO; + } } int -reconfigure (xlator_t *this, dict_t *options) +afr_set_favorite_child_policy(afr_private_t *priv, char *policy) { - afr_private_t *priv = NULL; - xlator_t *read_subvol = NULL; - int read_subvol_index = -1; - int ret = -1; - int index = -1; - char *qtype = NULL; + int index = -1; - priv = this->private; + index = gf_get_index_by_elem(afr_favorite_child_policies, policy); + if (index < 0 || index >= AFR_FAV_CHILD_POLICY_MAX) + return -1; - GF_OPTION_RECONF ("background-self-heal-count", - priv->background_self_heal_count, options, uint32, - out); + priv->fav_child_policy = index; - GF_OPTION_RECONF ("metadata-self-heal", - priv->metadata_self_heal, options, bool, out); + return 0; +} - GF_OPTION_RECONF ("data-self-heal", priv->data_self_heal, options, str, - out); +static void +set_data_self_heal_algorithm(afr_private_t *priv, char *algo) +{ + if (!algo) { + priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DYNAMIC; + } else if (strcmp(algo, "full") == 0) { + priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_FULL; + } else if (strcmp(algo, "diff") == 0) { + priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DIFF; + } else { + priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DYNAMIC; + } +} - GF_OPTION_RECONF ("entry-self-heal", priv->entry_self_heal, options, - bool, out); +void +afr_handle_anon_inode_options(afr_private_t *priv, dict_t *options) +{ + char *volfile_id_str = NULL; + uuid_t anon_inode_gfid = {0}; + + /*If volume id is not present don't enable anything*/ + if (dict_get_str(options, "volume-id", &volfile_id_str)) + return; + GF_ASSERT(strlen(AFR_ANON_DIR_PREFIX) + strlen(volfile_id_str) <= NAME_MAX); + /*anon_inode_name is not supposed to change once assigned*/ + if (!priv->anon_inode_name[0]) { + snprintf(priv->anon_inode_name, sizeof(priv->anon_inode_name), "%s-%s", + AFR_ANON_DIR_PREFIX, volfile_id_str); + gf_uuid_parse(volfile_id_str, anon_inode_gfid); + /*Flip a bit to make sure volfile-id and anon-gfid are not same*/ + anon_inode_gfid[0] ^= 1; + uuid_utoa_r(anon_inode_gfid, priv->anon_gfid_str); + } +} - GF_OPTION_RECONF ("strict-readdir", priv->strict_readdir, options, bool, - out); +int +reconfigure(xlator_t *this, dict_t *options) +{ + afr_private_t *priv = NULL; + xlator_t *read_subvol = NULL; + int read_subvol_index = -1; + int timeout_old = 0; + int ret = -1; + int index = -1; + char *qtype = NULL; + char *fav_child_policy = NULL; + char *data_self_heal = NULL; + char *data_self_heal_algorithm = NULL; + char *locking_scheme = NULL; + gf_boolean_t consistent_io = _gf_false; + gf_boolean_t choose_local_old = _gf_false; + gf_boolean_t enabled_old = _gf_false; - GF_OPTION_RECONF ("data-self-heal-window-size", - priv->data_self_heal_window_size, options, - uint32, out); + priv = this->private; - GF_OPTION_RECONF ("data-change-log", priv->data_change_log, options, - bool, out); + GF_OPTION_RECONF("metadata-splitbrain-forced-heal", + priv->metadata_splitbrain_forced_heal, options, bool, out); - GF_OPTION_RECONF ("metadata-change-log", - priv->metadata_change_log, options, bool, out); + GF_OPTION_RECONF("background-self-heal-count", + priv->background_self_heal_count, options, uint32, out); - GF_OPTION_RECONF ("entry-change-log", priv->entry_change_log, options, - bool, out); + GF_OPTION_RECONF("heal-wait-queue-length", priv->heal_wait_qlen, options, + uint32, out); - GF_OPTION_RECONF ("data-self-heal-algorithm", - priv->data_self_heal_algorithm, options, str, out); + GF_OPTION_RECONF("metadata-self-heal", priv->metadata_self_heal, options, + bool, out); - GF_OPTION_RECONF ("self-heal-daemon", priv->shd.enabled, options, bool, out); + GF_OPTION_RECONF("data-self-heal", data_self_heal, options, str, out); + if (gf_string2boolean(data_self_heal, &priv->data_self_heal) == -1) + goto out; - GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out); + GF_OPTION_RECONF("entry-self-heal", priv->entry_self_heal, options, bool, + out); - GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode, - options, uint32, out); + GF_OPTION_RECONF("data-self-heal-window-size", + priv->data_self_heal_window_size, options, uint32, out); - if (read_subvol) { - index = xlator_subvolume_index (this, read_subvol); - if (index == -1) { - gf_log (this->name, GF_LOG_ERROR, "%s not a subvolume", - read_subvol->name); - goto out; - } - priv->read_child = index; - } + GF_OPTION_RECONF("data-self-heal-algorithm", data_self_heal_algorithm, + options, str, out); + set_data_self_heal_algorithm(priv, data_self_heal_algorithm); - GF_OPTION_RECONF ("read-subvolume-index",read_subvol_index, options,int32,out); + GF_OPTION_RECONF("halo-enabled", priv->halo_enabled, options, bool, out); - if (read_subvol_index >-1) { - index=read_subvol_index; - if (index >= priv->child_count) { - gf_log (this->name, GF_LOG_ERROR, "%d not a subvolume-index", - index); - goto out; - } - priv->read_child = index; - } + GF_OPTION_RECONF("halo-shd-max-latency", priv->shd.halo_max_latency_msec, + options, uint32, out); - GF_OPTION_RECONF ("eager-lock", priv->eager_lock, options, bool, out); - GF_OPTION_RECONF ("quorum-type", qtype, options, str, out); - GF_OPTION_RECONF ("quorum-count", priv->quorum_count, options, - uint32, out); - fix_quorum_options(this,priv,qtype); - GF_OPTION_RECONF ("heal-timeout", priv->shd.timeout, options, - int32, out); - - GF_OPTION_RECONF ("post-op-delay-secs", priv->post_op_delay_secs, options, - uint32, out); - - GF_OPTION_RECONF (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, - options, size, out); - /* Reset this so we re-discover in case the topology changed. */ - GF_OPTION_RECONF ("readdir-failover", priv->readdir_failover, options, - bool, out); - GF_OPTION_RECONF ("ensure-durability", priv->ensure_durability, options, - bool, out); - priv->did_discovery = _gf_false; + GF_OPTION_RECONF("halo-nfsd-max-latency", priv->nfsd.halo_max_latency_msec, + options, uint32, out); - ret = 0; -out: - return ret; + GF_OPTION_RECONF("halo-max-latency", priv->halo_max_latency_msec, options, + uint32, out); -} + GF_OPTION_RECONF("halo-max-replicas", priv->halo_max_replicas, options, + uint32, out); + GF_OPTION_RECONF("halo-min-replicas", priv->halo_min_replicas, options, + uint32, out); -static const char *favorite_child_warning_str = "You have specified subvolume '%s' " - "as the 'favorite child'. This means that if a discrepancy in the content " - "or attributes (ownership, permission, etc.) of a file is detected among " - "the subvolumes, the file on '%s' will be considered the definitive " - "version and its contents will OVERWRITE the contents of the file on other " - "subvolumes. All versions of the file except that on '%s' " - "WILL BE LOST."; + GF_OPTION_RECONF("read-subvolume", read_subvol, options, xlator, out); + choose_local_old = priv->choose_local; + GF_OPTION_RECONF("choose-local", priv->choose_local, options, bool, out); -int32_t -init (xlator_t *this) -{ - afr_private_t *priv = NULL; - int child_count = 0; - xlator_list_t *trav = NULL; - int i = 0; - int ret = -1; - GF_UNUSED int op_errno = 0; - xlator_t *read_subvol = NULL; - int read_subvol_index = -1; - xlator_t *fav_child = NULL; - char *qtype = NULL; - - if (!this->children) { - gf_log (this->name, GF_LOG_ERROR, - "replicate translator needs more than one " - "subvolume defined."); - return -1; + if (choose_local_old != priv->choose_local) { + priv->read_child = -1; + if (choose_local_old == _gf_false) + priv->did_discovery = _gf_false; + } + + GF_OPTION_RECONF("read-hash-mode", priv->hash_mode, options, uint32, out); + + if (read_subvol) { + index = xlator_subvolume_index(this, read_subvol); + if (index == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL, + "%s not a subvolume", read_subvol->name); + goto out; } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "Volume is dangling."); + priv->read_child = index; + } + + GF_OPTION_RECONF("read-subvolume-index", read_subvol_index, options, int32, + out); + + if (read_subvol_index > -1) { + index = read_subvol_index; + if (index >= priv->child_count) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL, + "%d not a subvolume-index", index); + goto out; } + priv->read_child = index; + } + + GF_OPTION_RECONF("pre-op-compat", priv->pre_op_compat, options, bool, out); + GF_OPTION_RECONF("locking-scheme", locking_scheme, options, str, out); + priv->granular_locks = (strcmp(locking_scheme, "granular") == 0); + GF_OPTION_RECONF("full-lock", priv->full_lock, options, bool, out); + GF_OPTION_RECONF("granular-entry-heal", priv->esh_granular, options, bool, + out); + + GF_OPTION_RECONF("eager-lock", priv->eager_lock, options, bool, out); + GF_OPTION_RECONF("optimistic-change-log", priv->optimistic_change_log, + options, bool, out); + GF_OPTION_RECONF("quorum-type", qtype, options, str, out); + GF_OPTION_RECONF("quorum-count", priv->quorum_count, options, uint32, out); + fix_quorum_options(this, priv, qtype, options); + if (priv->quorum_count && !afr_has_quorum(priv->child_up, this, NULL)) + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_FAIL, + "Client-quorum is not met"); + + GF_OPTION_RECONF("post-op-delay-secs", priv->post_op_delay_secs, options, + uint32, out); + + GF_OPTION_RECONF(AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, options, + size_uint64, out); + /* Reset this so we re-discover in case the topology changed. */ + GF_OPTION_RECONF("ensure-durability", priv->ensure_durability, options, + bool, out); + + enabled_old = priv->shd.enabled; + GF_OPTION_RECONF("self-heal-daemon", priv->shd.enabled, options, bool, out); + + GF_OPTION_RECONF("iam-self-heal-daemon", priv->shd.iamshd, options, bool, + out); + + timeout_old = priv->shd.timeout; + GF_OPTION_RECONF("heal-timeout", priv->shd.timeout, options, int32, out); + + GF_OPTION_RECONF("consistent-metadata", priv->consistent_metadata, options, + bool, out); + + GF_OPTION_RECONF("shd-max-threads", priv->shd.max_threads, options, uint32, + out); + + GF_OPTION_RECONF("shd-wait-qlength", priv->shd.wait_qlength, options, + uint32, out); + + GF_OPTION_RECONF("favorite-child-policy", fav_child_policy, options, str, + out); + if (afr_set_favorite_child_policy(priv, fav_child_policy) == -1) + goto out; + + priv->did_discovery = _gf_false; + + GF_OPTION_RECONF("consistent-io", consistent_io, options, bool, out); + if (priv->quorum_count != 0) + consistent_io = _gf_false; + priv->consistent_io = consistent_io; + + afr_handle_anon_inode_options(priv, options); + + GF_OPTION_RECONF("use-anonymous-inode", priv->use_anon_inode, options, bool, + out); + if (priv->shd.enabled) { + if ((priv->shd.enabled != enabled_old) || + (timeout_old != priv->shd.timeout)) + afr_selfheal_childup(this, priv); + } + + ret = 0; +out: + return ret; +} - this->private = GF_CALLOC (1, sizeof (afr_private_t), - gf_afr_mt_afr_private_t); - if (!this->private) - goto out; +static int +afr_pending_xattrs_init(afr_private_t *priv, xlator_t *this) +{ + int ret = -1; + int i = 0; + char *ptr = NULL; + char *ptr1 = NULL; + char *xattrs_list = NULL; + xlator_list_t *trav = NULL; + int child_count = -1; + + trav = this->children; + child_count = priv->child_count; + if (priv->thin_arbiter_count) { + /* priv->pending_key[THIN_ARBITER_BRICK_INDEX] is used as the + * name of the thin arbiter file for persistence across add/ + * removal of DHT subvols.*/ + child_count++; + } + + GF_OPTION_INIT("afr-pending-xattr", xattrs_list, str, out); + priv->pending_key = GF_CALLOC(sizeof(*priv->pending_key), child_count, + gf_afr_mt_char); + if (!priv->pending_key) { + ret = -ENOMEM; + goto out; + } + if (!xattrs_list) { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_NO_CHANGELOG, + "Unable to fetch afr-pending-xattr option from volfile." + " Falling back to using client translator names. "); - priv = this->private; - LOCK_INIT (&priv->lock); - LOCK_INIT (&priv->read_child_lock); - //lock recovery is not done in afr - pthread_mutex_init (&priv->mutex, NULL); - INIT_LIST_HEAD (&priv->saved_fds); + while (i < child_count) { + ret = gf_asprintf(&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX, + trav->xlator->name); + if (ret == -1) { + ret = -ENOMEM; + goto out; + } + trav = trav->next; + i++; + } + ret = 0; + goto out; + } - child_count = xlator_subvolume_count (this); + ptr = ptr1 = gf_strdup(xattrs_list); + if (!ptr) { + ret = -ENOMEM; + goto out; + } + for (i = 0, ptr = strtok(ptr, ","); ptr; ptr = strtok(NULL, ",")) { + ret = gf_asprintf(&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX, + ptr); + if (ret == -1) { + ret = -ENOMEM; + goto out; + } + i++; + } + ret = 0; - priv->child_count = child_count; +out: + GF_FREE(ptr1); + return ret; +} - priv->read_child = -1; +void +afr_ta_init(afr_private_t *priv) +{ + priv->thin_arbiter_count = 1; + priv->child_count--; + priv->ta_child_up = 0; + priv->ta_bad_child_index = AFR_CHILD_UNKNOWN; + priv->ta_notify_dom_lock_offset = 0; + priv->ta_in_mem_txn_count = 0; + priv->ta_on_wire_txn_count = 0; + priv->release_ta_notify_dom_lock = _gf_false; + INIT_LIST_HEAD(&priv->ta_waitq); + INIT_LIST_HEAD(&priv->ta_onwireq); + gf_uuid_clear(priv->ta_gfid); +} - GF_OPTION_INIT ("read-subvolume", read_subvol, xlator, out); - if (read_subvol) { - priv->read_child = xlator_subvolume_index (this, read_subvol); - if (priv->read_child == -1) { - gf_log (this->name, GF_LOG_ERROR, "%s not a subvolume", - read_subvol->name); - goto out; - } - } - GF_OPTION_INIT ("read-subvolume-index",read_subvol_index,int32,out); - if (read_subvol_index > -1) { - if (read_subvol_index >= priv->child_count) { - gf_log (this->name, GF_LOG_ERROR, "%d not a subvolume-index", - read_subvol_index); - goto out; - } - priv->read_child = read_subvol_index; +int32_t +init(xlator_t *this) +{ + afr_private_t *priv = NULL; + int child_count = 0; + xlator_list_t *trav = NULL; + int i = 0; + int ret = -1; + GF_UNUSED int op_errno = 0; + xlator_t *read_subvol = NULL; + int read_subvol_index = -1; + char *qtype = NULL; + char *fav_child_policy = NULL; + char *thin_arbiter = NULL; + char *data_self_heal = NULL; + char *locking_scheme = NULL; + char *data_self_heal_algorithm = NULL; + + if (!this->children) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_CHILD_MISCONFIGURED, + "replicate translator needs more than one " + "subvolume defined."); + return -1; + } + + if (!this->parents) { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_VOL_MISCONFIGURED, + "Volume is dangling."); + } + + this->private = GF_CALLOC(1, sizeof(afr_private_t), + gf_afr_mt_afr_private_t); + if (!this->private) + goto out; + + priv = this->private; + INIT_LIST_HEAD(&priv->saved_locks); + INIT_LIST_HEAD(&priv->lk_healq); + LOCK_INIT(&priv->lock); + + child_count = xlator_subvolume_count(this); + + priv->child_count = child_count; + + priv->read_child = -1; + + GF_OPTION_INIT("arbiter-count", priv->arbiter_count, uint32, out); + GF_OPTION_INIT("thin-arbiter", thin_arbiter, str, out); + if (thin_arbiter && strlen(thin_arbiter) > 0) { + afr_ta_init(priv); + } + INIT_LIST_HEAD(&priv->healing); + INIT_LIST_HEAD(&priv->heal_waiting); + + priv->spb_choice_timeout = AFR_DEFAULT_SPB_CHOICE_TIMEOUT; + + GF_OPTION_INIT("afr-dirty-xattr", priv->afr_dirty, str, out); + + GF_OPTION_INIT("metadata-splitbrain-forced-heal", + priv->metadata_splitbrain_forced_heal, bool, out); + + GF_OPTION_INIT("read-subvolume", read_subvol, xlator, out); + if (read_subvol) { + priv->read_child = xlator_subvolume_index(this, read_subvol); + if (priv->read_child == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL, + "%s not a subvolume", read_subvol->name); + goto out; } - GF_OPTION_INIT ("choose-local", priv->choose_local, bool, out); - - GF_OPTION_INIT ("read-hash-mode", priv->hash_mode, uint32, out); - - priv->favorite_child = -1; - GF_OPTION_INIT ("favorite-child", fav_child, xlator, out); - if (fav_child) { - priv->favorite_child = xlator_subvolume_index (this, fav_child); - if (priv->favorite_child == -1) { - gf_log (this->name, GF_LOG_ERROR, "%s not a subvolume", - fav_child->name); - goto out; - } - gf_log (this->name, GF_LOG_WARNING, - favorite_child_warning_str, fav_child->name, - fav_child->name, fav_child->name); + } + GF_OPTION_INIT("read-subvolume-index", read_subvol_index, int32, out); + if (read_subvol_index > -1) { + if (read_subvol_index >= priv->child_count) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL, + "%d not a subvolume-index", read_subvol_index); + goto out; } + priv->read_child = read_subvol_index; + } + GF_OPTION_INIT("choose-local", priv->choose_local, bool, out); + priv->pending_reads = GF_CALLOC(sizeof(*priv->pending_reads), + priv->child_count, gf_afr_mt_atomic_t); - GF_OPTION_INIT ("background-self-heal-count", - priv->background_self_heal_count, uint32, out); + GF_OPTION_INIT("read-hash-mode", priv->hash_mode, uint32, out); - GF_OPTION_INIT ("data-self-heal", priv->data_self_heal, str, out); + priv->favorite_child = -1; - GF_OPTION_INIT ("data-self-heal-algorithm", - priv->data_self_heal_algorithm, str, out); + GF_OPTION_INIT("favorite-child-policy", fav_child_policy, str, out); + if (afr_set_favorite_child_policy(priv, fav_child_policy) == -1) + goto out; - GF_OPTION_INIT ("data-self-heal-window-size", - priv->data_self_heal_window_size, uint32, out); + GF_OPTION_INIT("shd-max-threads", priv->shd.max_threads, uint32, out); - GF_OPTION_INIT ("metadata-self-heal", priv->metadata_self_heal, bool, - out); + GF_OPTION_INIT("shd-wait-qlength", priv->shd.wait_qlength, uint32, out); - GF_OPTION_INIT ("entry-self-heal", priv->entry_self_heal, bool, out); + GF_OPTION_INIT("background-self-heal-count", + priv->background_self_heal_count, uint32, out); - GF_OPTION_INIT ("self-heal-daemon", priv->shd.enabled, bool, out); + GF_OPTION_INIT("heal-wait-queue-length", priv->heal_wait_qlen, uint32, out); - GF_OPTION_INIT ("iam-self-heal-daemon", priv->shd.iamshd, bool, out); + GF_OPTION_INIT("data-self-heal", data_self_heal, str, out); + if (gf_string2boolean(data_self_heal, &priv->data_self_heal) == -1) + goto out; - GF_OPTION_INIT ("data-change-log", priv->data_change_log, bool, out); + GF_OPTION_INIT("data-self-heal-algorithm", data_self_heal_algorithm, str, + out); + set_data_self_heal_algorithm(priv, data_self_heal_algorithm); - GF_OPTION_INIT ("metadata-change-log", priv->metadata_change_log, bool, - out); + GF_OPTION_INIT("data-self-heal-window-size", + priv->data_self_heal_window_size, uint32, out); - GF_OPTION_INIT ("entry-change-log", priv->entry_change_log, bool, out); + GF_OPTION_INIT("metadata-self-heal", priv->metadata_self_heal, bool, out); - GF_OPTION_INIT ("optimistic-change-log", priv->optimistic_change_log, - bool, out); + GF_OPTION_INIT("entry-self-heal", priv->entry_self_heal, bool, out); - GF_OPTION_INIT ("inodelk-trace", priv->inodelk_trace, bool, out); + GF_OPTION_INIT("halo-shd-max-latency", priv->shd.halo_max_latency_msec, + uint32, out); - GF_OPTION_INIT ("entrylk-trace", priv->entrylk_trace, bool, out); + GF_OPTION_INIT("halo-max-latency", priv->halo_max_latency_msec, uint32, + out); + GF_OPTION_INIT("halo-max-replicas", priv->halo_max_replicas, uint32, out); + GF_OPTION_INIT("halo-min-replicas", priv->halo_min_replicas, uint32, out); - GF_OPTION_INIT ("strict-readdir", priv->strict_readdir, bool, out); + GF_OPTION_INIT("halo-enabled", priv->halo_enabled, bool, out); - GF_OPTION_INIT ("eager-lock", priv->eager_lock, bool, out); - GF_OPTION_INIT ("quorum-type", qtype, str, out); - GF_OPTION_INIT ("quorum-count", priv->quorum_count, uint32, out); - GF_OPTION_INIT (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, size, - out); - fix_quorum_options(this,priv,qtype); + GF_OPTION_INIT("halo-nfsd-max-latency", priv->nfsd.halo_max_latency_msec, + uint32, out); - GF_OPTION_INIT ("post-op-delay-secs", priv->post_op_delay_secs, uint32, out); - GF_OPTION_INIT ("readdir-failover", priv->readdir_failover, bool, out); - GF_OPTION_INIT ("ensure-durability", priv->ensure_durability, bool, - out); + GF_OPTION_INIT("iam-nfs-daemon", priv->nfsd.iamnfsd, bool, out); - priv->wait_count = 1; + GF_OPTION_INIT("optimistic-change-log", priv->optimistic_change_log, bool, + out); - priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, - gf_afr_mt_char); - if (!priv->child_up) { - ret = -ENOMEM; - goto out; - } + GF_OPTION_INIT("pre-op-compat", priv->pre_op_compat, bool, out); + GF_OPTION_INIT("locking-scheme", locking_scheme, str, out); + priv->granular_locks = (strcmp(locking_scheme, "granular") == 0); + GF_OPTION_INIT("full-lock", priv->full_lock, bool, out); + GF_OPTION_INIT("granular-entry-heal", priv->esh_granular, bool, out); - for (i = 0; i < child_count; i++) - priv->child_up[i] = -1; /* start with unknown state. - this initialization needed - for afr_notify() to work - reliably - */ + GF_OPTION_INIT("eager-lock", priv->eager_lock, bool, out); + GF_OPTION_INIT("quorum-type", qtype, str, out); + GF_OPTION_INIT("quorum-count", priv->quorum_count, uint32, out); + GF_OPTION_INIT(AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, size_uint64, + out); + fix_quorum_options(this, priv, qtype, this->options); - priv->children = GF_CALLOC (sizeof (xlator_t *), child_count, - gf_afr_mt_xlator_t); - if (!priv->children) { - ret = -ENOMEM; - goto out; - } + GF_OPTION_INIT("post-op-delay-secs", priv->post_op_delay_secs, uint32, out); + GF_OPTION_INIT("ensure-durability", priv->ensure_durability, bool, out); - priv->pending_key = GF_CALLOC (sizeof (*priv->pending_key), - child_count, - gf_afr_mt_char); - if (!priv->pending_key) { - ret = -ENOMEM; - goto out; - } + GF_OPTION_INIT("self-heal-daemon", priv->shd.enabled, bool, out); - trav = this->children; - i = 0; - while (i < child_count) { - priv->children[i] = trav->xlator; - - ret = gf_asprintf (&priv->pending_key[i], "%s.%s", - AFR_XATTR_PREFIX, - trav->xlator->name); - if (-1 == ret) { - ret = -ENOMEM; - goto out; - } - - trav = trav->next; - i++; - } + GF_OPTION_INIT("iam-self-heal-daemon", priv->shd.iamshd, bool, out); + GF_OPTION_INIT("heal-timeout", priv->shd.timeout, int32, out); - ret = gf_asprintf (&priv->sh_domain, "%s-self-heal", this->name); - if (-1 == ret) { - ret = -ENOMEM; - goto out; - } + GF_OPTION_INIT("consistent-metadata", priv->consistent_metadata, bool, out); + GF_OPTION_INIT("consistent-io", priv->consistent_io, bool, out); + afr_handle_anon_inode_options(priv, this->options); - priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event), - gf_afr_mt_int32_t); - if (!priv->last_event) { - ret = -ENOMEM; - goto out; - } + GF_OPTION_INIT("use-anonymous-inode", priv->use_anon_inode, bool, out); + if (priv->quorum_count != 0) + priv->consistent_io = _gf_false; - /* keep more local here as we may need them for self-heal etc */ - this->local_pool = mem_pool_new (afr_local_t, 512); - if (!this->local_pool) { - ret = -1; - gf_log (this->name, GF_LOG_ERROR, - "failed to create local_t's memory pool"); - goto out; - } + priv->wait_count = 1; - priv->first_lookup = 1; - priv->root_inode = NULL; + priv->local = GF_CALLOC(sizeof(unsigned char), child_count, gf_afr_mt_char); + if (!priv->local) { + ret = -ENOMEM; + goto out; + } - if (!priv->shd.iamshd) { - ret = 0; - goto out; - } + priv->anon_inode = GF_CALLOC(sizeof(unsigned char), child_count, + gf_afr_mt_char); - ret = -ENOMEM; - priv->shd.pos = GF_CALLOC (sizeof (*priv->shd.pos), child_count, - gf_afr_mt_brick_pos_t); - if (!priv->shd.pos) - goto out; + priv->child_up = GF_CALLOC(sizeof(unsigned char), child_count, + gf_afr_mt_char); - priv->shd.pending = GF_CALLOC (sizeof (*priv->shd.pending), child_count, - gf_afr_mt_int32_t); - if (!priv->shd.pending) - goto out; + priv->child_latency = GF_MALLOC(sizeof(*priv->child_latency) * child_count, + gf_afr_mt_child_latency_t); + priv->halo_child_up = GF_CALLOC(sizeof(unsigned char), child_count, + gf_afr_mt_char); - priv->shd.inprogress = GF_CALLOC (sizeof (*priv->shd.inprogress), - child_count, gf_afr_mt_shd_bool_t); - if (!priv->shd.inprogress) - goto out; - priv->shd.timer = GF_CALLOC (sizeof (*priv->shd.timer), child_count, - gf_afr_mt_shd_timer_t); - if (!priv->shd.timer) - goto out; + if (!priv->child_up || !priv->child_latency || !priv->halo_child_up || + !priv->anon_inode) { + ret = -ENOMEM; + goto out; + } + /*Initialize to -ve ping timeout so that they are not considered + * in child-up events until ping-event comes*/ + for (i = 0; i < child_count; i++) + priv->child_latency[i] = -1; + + priv->children = GF_CALLOC(sizeof(xlator_t *), child_count, + gf_afr_mt_xlator_t); + if (!priv->children) { + ret = -ENOMEM; + goto out; + } + + ret = afr_pending_xattrs_init(priv, this); + if (ret) + goto out; + + trav = this->children; + i = 0; + while (i < child_count) { + priv->children[i] = trav->xlator; + trav = trav->next; + i++; + } + + ret = gf_asprintf(&priv->sh_domain, AFR_SH_DATA_DOMAIN_FMT, this->name); + if (-1 == ret) { + ret = -ENOMEM; + goto out; + } - priv->shd.healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false); - if (!priv->shd.healed) - goto out; + priv->last_event = GF_CALLOC(child_count, sizeof(*priv->last_event), + gf_afr_mt_int32_t); + if (!priv->last_event) { + ret = -ENOMEM; + goto out; + } - priv->shd.heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false); - if (!priv->shd.heal_failed) - goto out; + this->itable = inode_table_new(SHD_INODE_LRU_LIMIT, this); + if (!this->itable) { + ret = -ENOMEM; + goto out; + } + + if (priv->shd.iamshd) { + ret = afr_selfheal_daemon_init(this); + if (ret) { + ret = -ENOMEM; + goto out; + } + } - priv->shd.split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false); - if (!priv->shd.split_brain) - goto out; + /* keep more local here as we may need them for self-heal etc */ + this->local_pool = mem_pool_new(afr_local_t, 512); + if (!this->local_pool) { + ret = -1; + goto out; + } - this->itable = inode_table_new (SHD_INODE_LRU_LIMIT, this); - if (!this->itable) - goto out; - priv->root_inode = inode_ref (this->itable->root); - GF_OPTION_INIT ("node-uuid", priv->shd.node_uuid, str, out); - GF_OPTION_INIT ("heal-timeout", priv->shd.timeout, int32, out); + priv->root_inode = NULL; - ret = 0; + ret = 0; out: - return ret; + return ret; +} +void +afr_destroy_healer_object(xlator_t *this, struct subvol_healer *healer) +{ + int ret = -1; + + if (!healer) + return; + + if (healer->running) { + /* + * If there are any resources to cleanup, We need + * to do that gracefully using pthread_cleanup_push + */ + ret = gf_thread_cleanup_xint(healer->thread); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SELF_HEAL_FAILED, + "Failed to clean up healer threads."); + healer->thread = 0; + } + pthread_cond_destroy(&healer->cond); + pthread_mutex_destroy(&healer->mutex); } - -int -fini (xlator_t *this) +void +afr_selfheal_daemon_fini(xlator_t *this) +{ + struct subvol_healer *healer = NULL; + afr_self_heald_t *shd = NULL; + afr_private_t *priv = NULL; + int i = 0; + + priv = this->private; + if (!priv) + return; + + shd = &priv->shd; + if (!shd->iamshd) + return; + + for (i = 0; i < priv->child_count; i++) { + healer = &shd->index_healers[i]; + afr_destroy_healer_object(this, healer); + + healer = &shd->full_healers[i]; + afr_destroy_healer_object(this, healer); + + if (shd->statistics[i]) + eh_destroy(shd->statistics[i]); + } + GF_FREE(shd->index_healers); + GF_FREE(shd->full_healers); + GF_FREE(shd->statistics); + if (shd->split_brain) + eh_destroy(shd->split_brain); +} +void +fini(xlator_t *this) { - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; - priv = this->private; - this->private = NULL; - afr_priv_destroy (priv); - if (this->itable);//I dont see any destroy func + priv = this->private; - return 0; -} + afr_selfheal_daemon_fini(this); + GF_ASSERT(list_empty(&priv->saved_locks)); + + LOCK(&priv->lock); + if (priv->timer != NULL) { + gf_timer_call_cancel(this->ctx, priv->timer); + priv->timer = NULL; + } + UNLOCK(&priv->lock); + + if (this->local_pool != NULL) { + mem_pool_destroy(this->local_pool); + this->local_pool = NULL; + } + this->private = NULL; + afr_priv_destroy(priv); + if (this->itable) { + inode_table_destroy(this->itable); + this->itable = NULL; + } + + return; +} struct xlator_fops fops = { - .lookup = afr_lookup, - .open = afr_open, - .lk = afr_lk, - .flush = afr_flush, - .statfs = afr_statfs, - .fsync = afr_fsync, - .fsyncdir = afr_fsyncdir, - .xattrop = afr_xattrop, - .fxattrop = afr_fxattrop, - .inodelk = afr_inodelk, - .finodelk = afr_finodelk, - .entrylk = afr_entrylk, - .fentrylk = afr_fentrylk, - .fallocate = afr_fallocate, - .discard = afr_discard, - - /* inode read */ - .access = afr_access, - .stat = afr_stat, - .fstat = afr_fstat, - .readlink = afr_readlink, - .getxattr = afr_getxattr, - .fgetxattr = afr_fgetxattr, - .readv = afr_readv, - - /* inode write */ - .writev = afr_writev, - .truncate = afr_truncate, - .ftruncate = afr_ftruncate, - .setxattr = afr_setxattr, - .fsetxattr = afr_fsetxattr, - .setattr = afr_setattr, - .fsetattr = afr_fsetattr, - .removexattr = afr_removexattr, - .fremovexattr = afr_fremovexattr, - - /* dir read */ - .opendir = afr_opendir, - .readdir = afr_readdir, - .readdirp = afr_readdirp, - - /* dir write */ - .create = afr_create, - .mknod = afr_mknod, - .mkdir = afr_mkdir, - .unlink = afr_unlink, - .rmdir = afr_rmdir, - .link = afr_link, - .symlink = afr_symlink, - .rename = afr_rename, + .lookup = afr_lookup, + .lk = afr_lk, + .flush = afr_flush, + .statfs = afr_statfs, + .fsyncdir = afr_fsyncdir, + .inodelk = afr_inodelk, + .finodelk = afr_finodelk, + .entrylk = afr_entrylk, + .fentrylk = afr_fentrylk, + .ipc = afr_ipc, + .lease = afr_lease, + + /* inode read */ + .access = afr_access, + .stat = afr_stat, + .fstat = afr_fstat, + .readlink = afr_readlink, + .getxattr = afr_getxattr, + .fgetxattr = afr_fgetxattr, + .readv = afr_readv, + .seek = afr_seek, + + /* inode write */ + .writev = afr_writev, + .truncate = afr_truncate, + .ftruncate = afr_ftruncate, + .setxattr = afr_setxattr, + .fsetxattr = afr_fsetxattr, + .setattr = afr_setattr, + .fsetattr = afr_fsetattr, + .removexattr = afr_removexattr, + .fremovexattr = afr_fremovexattr, + .fallocate = afr_fallocate, + .discard = afr_discard, + .zerofill = afr_zerofill, + .xattrop = afr_xattrop, + .fxattrop = afr_fxattrop, + .fsync = afr_fsync, + + /*inode open*/ + .opendir = afr_opendir, + .open = afr_open, + + /* dir read */ + .readdir = afr_readdir, + .readdirp = afr_readdirp, + + /* dir write */ + .create = afr_create, + .mknod = afr_mknod, + .mkdir = afr_mkdir, + .unlink = afr_unlink, + .rmdir = afr_rmdir, + .link = afr_link, + .symlink = afr_symlink, + .rename = afr_rename, }; - struct xlator_dumpops dumpops = { - .priv = afr_priv_dump, + .priv = afr_priv_dump, }; - struct xlator_cbks cbks = { - .release = afr_release, - .releasedir = afr_releasedir, - .forget = afr_forget, + .release = afr_release, + .releasedir = afr_releasedir, + .forget = afr_forget, }; - struct volume_options options[] = { - { .key = {"read-subvolume" }, - .type = GF_OPTION_TYPE_XLATOR, - .description = "inode-read fops happen only on one of the bricks in " - "replicate. Afr will prefer the one specified using " - "this option if it is not stale. Option value must be " - "one of the xlator names of the children. " - "Ex: <volname>-client-0 till " - "<volname>-client-<number-of-bricks - 1>" - }, - { .key = {"read-subvolume-index" }, - .type = GF_OPTION_TYPE_INT, - .default_value = "-1", - .description = "inode-read fops happen only on one of the bricks in " - "replicate. AFR will prefer the one specified using " - "this option if it is not stale. allowed options" - " include -1 till replica-count - 1" - }, - { .key = {"read-hash-mode" }, - .type = GF_OPTION_TYPE_INT, - .min = 0, - .max = 2, - .default_value = "0", - .description = "inode-read fops happen only on one of the bricks in " - "replicate. AFR will prefer the one computed using " - "the method specified using this option" - "0 = first responder, " - "1 = hash by GFID of file (all clients use " - "same subvolume), " - "2 = hash by GFID of file and client PID", - }, - { .key = {"choose-local" }, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "true", - .description = "Choose a local subvolume(i.e. Brick) to read from if " - "read-subvolume is not explicitly set.", - }, - { .key = {"favorite-child"}, - .type = GF_OPTION_TYPE_XLATOR, - .description = "If a split-brain happens choose subvol/brick set by " - "this option as source." - }, - { .key = {"background-self-heal-count"}, - .type = GF_OPTION_TYPE_INT, - .min = 0, - .default_value = "16", - .validate = GF_OPT_VALIDATE_MIN, - .description = "This specifies the number of self-heals that can be " - " performed in background without blocking the fop" - }, - { .key = {"data-self-heal"}, - .type = GF_OPTION_TYPE_STR, - .value = {"1", "on", "yes", "true", "enable", - "0", "off", "no", "false", "disable", - "open"}, - .default_value = "on", - .description = "Using this option we can enable/disable data " - "self-heal on the file. \"open\" means data " - "self-heal action will only be triggered by file " - "open operations." - }, - { .key = {"data-self-heal-algorithm"}, - .type = GF_OPTION_TYPE_STR, - .description = "Select between \"full\", \"diff\". The " - "\"full\" algorithm copies the entire file from " - "source to sink. The \"diff\" algorithm copies to " - "sink only those blocks whose checksums don't match " - "with those of source. If no option is configured " - "the option is chosen dynamically as follows: " - "If the file does not exist on one of the sinks " - "or empty file exists or if the source file size is " - "about the same as page size the entire file will " - "be read and written i.e \"full\" algo, " - "otherwise \"diff\" algo is chosen.", - .value = { "diff", "full"} - }, - { .key = {"data-self-heal-window-size"}, - .type = GF_OPTION_TYPE_INT, - .min = 1, - .max = 1024, - .default_value = "1", - .description = "Maximum number blocks per file for which self-heal " - "process would be applied simultaneously." - }, - { .key = {"metadata-self-heal"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "on", - .description = "Using this option we can enable/disable metadata " - "i.e. Permissions, ownerships, xattrs self-heal on " - "the file/directory." - }, - { .key = {"entry-self-heal"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "on", - .description = "Using this option we can enable/disable entry " - "self-heal on the directory." - }, - { .key = {"data-change-log"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "on", - .description = "Data fops like write/truncate will not perform " - "pre/post fop changelog operations in afr transaction " - "if this option is disabled" - }, - { .key = {"metadata-change-log"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "on", - .description = "Metadata fops like setattr/setxattr will not perform " - "pre/post fop changelog operations in afr transaction " - "if this option is disabled" - }, - { .key = {"entry-change-log"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "on", - .description = "Entry fops like create/unlink will not perform " - "pre/post fop changelog operations in afr transaction " - "if this option is disabled" - }, - { .key = {"optimistic-change-log"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "on", - .description = "Entry/Metadata fops will not perform " - "pre fop changelog operations in afr transaction " - "if this option is enabled." - }, - { .key = {"strict-readdir"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - }, - { .key = {"inodelk-trace"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - .description = "Enabling this option logs inode lock/unlocks" - }, - { .key = {"entrylk-trace"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - .description = "Enabling this option logs entry lock/unlocks" - }, - { .key = {"eager-lock"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "on", - .description = "Lock phase of a transaction has two sub-phases. " - "First is an attempt to acquire locks in parallel by " - "broadcasting non-blocking lock requests. If lock " - "aquistion fails on any server, then the held locks " - "are unlocked and revert to a blocking locked mode " - "sequentially on one server after another. If this " - "option is enabled the initial broadcasting lock " - "request attempt to acquire lock on the entire file. " - "If this fails, we revert back to the sequential " - "\"regional\" blocking lock as before. In the case " - "where such an \"eager\" lock is granted in the " - "non-blocking phase, it gives rise to an opportunity " - "for optimization. i.e, if the next write transaction " - "on the same FD arrives before the unlock phase of " - "the first transaction, it \"takes over\" the full " - "file lock. Similarly if yet another data transaction " - "arrives before the unlock phase of the \"optimized\" " - "transaction, that in turn \"takes over\" the lock as " - "well. The actual unlock now happens at the end of " - "the last \"optimzed\" transaction." - - }, - { .key = {"self-heal-daemon"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - .description = "This option applies to only self-heal-daemon. " - "Index directory crawl and automatic healing of files" - "will not be performed if this option is turned off." - }, - { .key = {"iam-self-heal-daemon"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - .description = "This option differentiates if the replicate " - "translator is running as part of self-heal-daemon " - "or not." - }, - { .key = {"quorum-type"}, - .type = GF_OPTION_TYPE_STR, - .value = { "none", "auto", "fixed"}, - .default_value = "none", - .description = "If value is \"fixed\" only allow writes if " - "quorum-count bricks are present. If value is " - "\"auto\" only allow writes if more than half of " - "bricks, or exactly half including the first, are " - "present.", - }, - { .key = {"quorum-count"}, - .type = GF_OPTION_TYPE_INT, - .min = 1, - .max = INT_MAX, - .default_value = 0, - .description = "If quorum-type is \"fixed\" only allow writes if " - "this many bricks or present. Other quorum types " - "will OVERWRITE this value.", - }, - { .key = {"node-uuid"}, - .type = GF_OPTION_TYPE_STR, - .description = "Local glusterd uuid string, used in starting " - "self-heal-daemon so that it can crawl only on " - "local index directories.", - }, - { .key = {"heal-timeout"}, - .type = GF_OPTION_TYPE_INT, - .min = 60, - .max = INT_MAX, - .default_value = "600", - .description = "time interval for checking the need to self-heal " - "in self-heal-daemon" - }, - { .key = {"post-op-delay-secs"}, - .type = GF_OPTION_TYPE_INT, - .min = 0, - .max = INT_MAX, - .default_value = "1", - .description = "Time interval induced artificially before " - "post-operation phase of the transaction to " - "enhance overlap of adjacent write operations.", - }, - { .key = {AFR_SH_READDIR_SIZE_KEY}, - .type = GF_OPTION_TYPE_SIZET, - .description = "readdirp size for performing entry self-heal", - .min = 1024, - .max = 131072, - .default_value = "1KB", - }, - { .key = {"readdir-failover"}, - .type = GF_OPTION_TYPE_BOOL, - .description = "readdir(p) will not failover if this option is off", - .default_value = "on", - }, - { .key = {"ensure-durability"}, - .type = GF_OPTION_TYPE_BOOL, - .description = "Afr performs fsyncs for transactions if this " - "option is on to make sure the changelogs/data is " - "written to the disk", - .default_value = "on", - }, - { .key = {NULL} }, + {.key = {"read-subvolume"}, + .type = GF_OPTION_TYPE_XLATOR, + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "inode-read fops happen only on one of the bricks in " + "replicate. Afr will prefer the one specified using " + "this option if it is not stale. Option value must be " + "one of the xlator names of the children. " + "Ex: <volname>-client-0 till " + "<volname>-client-<number-of-bricks - 1>"}, + {.key = {"read-subvolume-index"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "-1", + .op_version = {2}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "inode-read fops happen only on one of the bricks in " + "replicate. AFR will prefer the one specified using " + "this option if it is not stale. allowed options" + " include -1 till replica-count - 1"}, + {.key = {"read-hash-mode"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = 5, + .default_value = "1", + .op_version = {2}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = + "inode-read fops happen only on one of the bricks in " + "replicate. AFR will prefer the one computed using " + "the method specified using this option.\n" + "0 = first readable child of AFR, starting from 1st child.\n" + "1 = hash by GFID of file (all clients use " + "same subvolume).\n" + "2 = hash by GFID of file and client PID.\n" + "3 = brick having the least outstanding read requests.\n" + "4 = brick having the least network ping latency.\n" + "5 = Hybrid mode between 3 and 4, ie least value among " + "network-latency multiplied by outstanding-read-requests."}, + { + .key = {"choose-local"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "true", + .op_version = {2}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "Choose a local subvolume (i.e. Brick) to read from" + " if read-subvolume is not explicitly set.", + }, + {.key = {"background-self-heal-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = 256, + .default_value = "8", + .validate = GF_OPT_VALIDATE_MIN, + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "This specifies the number of per client self-heal " + "jobs that can perform parallel heals in the " + "background."}, + {.key = {"halo-shd-max-latency"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "99999", + .op_version = {GD_OP_VERSION_3_11_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate", "halo"}, + .description = "Maximum latency for shd halo replication in msec."}, + {.key = {"halo-enabled"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "False", + .op_version = {GD_OP_VERSION_3_11_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate", "halo"}, + .description = "Enable Halo (geo) replication mode."}, + {.key = {"halo-nfsd-max-latency"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "5", + .op_version = {GD_OP_VERSION_3_11_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate", "halo"}, + .description = "Maximum latency for nfsd halo replication in msec."}, + {.key = {"halo-max-latency"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = AFR_HALO_MAX_LATENCY, + .default_value = "5", + .op_version = {GD_OP_VERSION_3_11_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate", "halo"}, + .description = "Maximum latency for halo replication in msec."}, + {.key = {"halo-max-replicas"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "99999", + .op_version = {GD_OP_VERSION_3_11_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate", "halo"}, + .description = "The maximum number of halo replicas; replicas" + " beyond this value will be written asynchronously" + "via the SHD."}, + {.key = {"halo-min-replicas"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "2", + .op_version = {GD_OP_VERSION_3_11_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate", "halo"}, + .description = "The minimmum number of halo replicas, before adding " + "out of region replicas."}, + {.key = {"heal-wait-queue-length"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = 10000, /*Around 100MB with sizeof(afr_local_t)= 10496 bytes*/ + .default_value = "128", + .validate = GF_OPT_VALIDATE_MIN, + .op_version = {GD_OP_VERSION_3_7_10}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "This specifies the number of heals that can be queued" + " for the parallel background self heal jobs."}, + {.key = {"data-self-heal"}, + .type = GF_OPTION_TYPE_STR, + .value = {"1", "on", "yes", "true", "enable", "0", "off", "no", "false", + "disable", "open"}, + .default_value = "off", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "Using this option we can enable/disable data " + "self-heal on the file. \"open\" means data " + "self-heal action will only be triggered by file " + "open operations."}, + {.key = {"data-self-heal-algorithm"}, + .type = GF_OPTION_TYPE_STR, + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "Select between \"full\", \"diff\". The " + "\"full\" algorithm copies the entire file from " + "source to sink. The \"diff\" algorithm copies to " + "sink only those blocks whose checksums don't match " + "with those of source. If no option is configured " + "the option is chosen dynamically as follows: " + "If the file does not exist on one of the sinks " + "or empty file exists or if the source file size is " + "about the same as page size the entire file will " + "be read and written i.e \"full\" algo, " + "otherwise \"diff\" algo is chosen.", + .value = {"diff", "full"}}, + {.key = {"data-self-heal-window-size"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 1024, + .default_value = "1", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "Maximum number blocks per file for which self-heal " + "process would be applied simultaneously."}, + {.key = {"metadata-self-heal"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + /*.validate_fn = validate_replica*/ + .description = "Using this option we can enable/disable metadata " + "i.e. Permissions, ownerships, xattrs self-heal on " + "the file/directory."}, + {.key = {"entry-self-heal"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + /*.validate_fn = validate_replica*/ + .description = "Using this option we can enable/disable entry " + "self-heal on the directory."}, + {.key = {"data-change-log"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "This option exists only for backward compatibility " + "and configuring it doesn't have any effect"}, + {.key = {"metadata-change-log"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "This option exists only for backward compatibility " + "and configuring it doesn't have any effect"}, + {.key = {"entry-change-log"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "This option exists only for backward compatibility " + "and configuring it doesn't have any effect"}, + {.key = {"optimistic-change-log"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Entry/Metadata fops will not perform " + "pre fop changelog operations in afr transaction " + "if this option is enabled."}, + {.key = {"inodelk-trace"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Enabling this option logs inode lock/unlocks"}, + {.key = {"entrylk-trace"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Enabling this option logs entry lock/unlocks"}, + {.key = {"pre-op-compat"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Use separate pre-op xattrop() FOP rather than " + "overloading xdata of the OP"}, + {.key = {"eager-lock"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = + "Enable/Disable eager lock for replica volume. " + "Lock phase of a transaction has two sub-phases. " + "First is an attempt to acquire locks in parallel by " + "broadcasting non-blocking lock requests. If lock " + "acquisition fails on any server, then the held locks " + "are unlocked and we revert to a blocking locks mode " + "sequentially on one server after another. If this " + "option is enabled the initial broadcasting lock " + "request attempts to acquire a full lock on the entire file. " + "If this fails, we revert back to the sequential " + "\"regional\" blocking locks as before. In the case " + "where such an \"eager\" lock is granted in the " + "non-blocking phase, it gives rise to an opportunity " + "for optimization. i.e, if the next write transaction " + "on the same FD arrives before the unlock phase of " + "the first transaction, it \"takes over\" the full " + "file lock. Similarly if yet another data transaction " + "arrives before the unlock phase of the \"optimized\" " + "transaction, that in turn \"takes over\" the lock as " + "well. The actual unlock now happens at the end of " + "the last \"optimized\" transaction." + + }, + {.key = {"self-heal-daemon"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE, + .tags = {"replicate"}, + /*.validate_fn = validate_replica_heal_enable_disable*/ + .description = "This option applies to only self-heal-daemon. " + "Index directory crawl and automatic healing of files " + "will not be performed if this option is turned off."}, + {.key = {"iam-self-heal-daemon"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option differentiates if the replicate " + "translator is running as part of self-heal-daemon " + "or not."}, + {.key = {"iam-nfs-daemon"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option differentiates if the replicate " + "translator is running as part of an NFS daemon " + "or not."}, + { + .key = {"quorum-type"}, + .type = GF_OPTION_TYPE_STR, + .value = {"none", "auto", "fixed"}, + .default_value = "none", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + /*.option = quorum-type*/ + .description = "If value is \"fixed\" only allow writes if " + "quorum-count bricks are present. If value is " + "\"auto\" only allow writes if more than half of " + "bricks, or exactly half including the first, are " + "present.", + }, + { + .key = {"quorum-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = INT_MAX, + .default_value = 0, + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + /*.option = quorum-count*/ + /*.validate_fn = validate_quorum_count*/ + .description = "If quorum-type is \"fixed\" only allow writes if " + "this many bricks are present. Other quorum types " + "will OVERWRITE this value.", + }, + { + .key = {"quorum-reads"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", + .op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "This option has been removed. Reads are not allowed " + "if quorum is not met.", + }, + { + .key = {"node-uuid"}, + .type = GF_OPTION_TYPE_STR, + .description = "Local glusterd uuid string, used in starting " + "self-heal-daemon so that it can crawl only on " + "local index directories.", + }, + { + .key = {"post-op-delay-secs"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = INT_MAX, + .default_value = "1", + .op_version = {2}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "Time interval induced artificially before " + "post-operation phase of the transaction to " + "enhance overlap of adjacent write operations.", + }, + { + .key = {AFR_SH_READDIR_SIZE_KEY}, + .type = GF_OPTION_TYPE_SIZET, + .description = "readdirp size for performing entry self-heal", + .min = 1024, + .max = 131072, + .op_version = {2}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE, + .tags = {"replicate"}, + .default_value = "1KB", + }, + { + .key = {"ensure-durability"}, + .type = GF_OPTION_TYPE_BOOL, + .op_version = {3}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "Afr performs fsyncs for transactions if this " + "option is on to make sure the changelogs/data is " + "written to the disk", + .default_value = "on", + }, + { + .key = {"afr-dirty-xattr"}, + .type = GF_OPTION_TYPE_STR, + .default_value = AFR_DIRTY_DEFAULT, + }, + {.key = {"afr-pending-xattr"}, + .type = GF_OPTION_TYPE_STR, + .description = "Comma separated list of xattrs that are used to " + "capture information on pending heals."}, + { + .key = {"metadata-splitbrain-forced-heal"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + }, + {.key = {"heal-timeout"}, + .type = GF_OPTION_TYPE_INT, + .min = 5, + .max = INT_MAX, + .default_value = "600", + .op_version = {2}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "time interval for checking the need to self-heal " + "in self-heal-daemon"}, + { + .key = {"consistent-metadata"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", + .op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "If this option is enabled, readdirp will force " + "lookups on those entries read whose read child is " + "not the same as that of the parent. This will " + "guarantee that all read operations on a file serve " + "attributes from the same subvol as long as it holds " + " a good copy of the file/dir.", + }, + {.key = {"arbiter-count"}, + .type = GF_OPTION_TYPE_INT, + .description = "subset of child_count. Has to be 0 or 1."}, + { + .key = {"thin-arbiter"}, + .type = GF_OPTION_TYPE_STR, + .op_version = {GD_OP_VERSION_4_1_0}, + .flags = OPT_FLAG_SETTABLE, + .tags = {"replicate"}, + .description = "contains host:path of thin abriter brick", + }, + {.key = {"shd-max-threads"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 64, + .default_value = "1", + .op_version = {GD_OP_VERSION_3_7_12}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "Maximum number of parallel heals SHD can do per " + "local brick. This can substantially lower heal times" + ", but can also crush your bricks if you don't have " + "the storage hardware to support this."}, + { + .key = {"shd-wait-qlength"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 655536, + .default_value = "1024", + .op_version = {GD_OP_VERSION_3_7_12}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "This option can be used to control number of heals" + " that can wait in SHD per subvolume", + }, + { + .key = {"locking-scheme"}, + .type = GF_OPTION_TYPE_STR, + .value = {"full", "granular"}, + .default_value = "full", + .op_version = {GD_OP_VERSION_3_7_12}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "If this option is set to granular, self-heal will " + "stop being compatible with afr-v1, which helps afr " + "be more granular while self-healing", + }, + {.key = {"full-lock"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "yes", + .op_version = {GD_OP_VERSION_3_13_2}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE, + .tags = {"replicate"}, + .description = "If this option is disabled, then the IOs will take " + "range locks same as versions till 3.13.1."}, + { + .key = {"granular-entry-heal"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", + .op_version = {GD_OP_VERSION_3_8_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "If this option is enabled, self-heal will resort to " + "granular way of recording changelogs and doing entry " + "self-heal.", + }, + { + .key = {"favorite-child-policy"}, + .type = GF_OPTION_TYPE_STR, + .value = {"none", "size", "ctime", "mtime", "majority"}, + .default_value = "none", + .op_version = {GD_OP_VERSION_3_7_12}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "This option can be used to automatically resolve " + "split-brains using various policies without user " + "intervention. \"size\" picks the file with the " + "biggest size as the source. \"ctime\" and \"mtime\" " + "pick the file with the latest ctime and mtime " + "respectively as the source. \"majority\" picks a file" + " with identical mtime and size in more than half the " + "number of bricks in the replica.", + }, + { + .key = {"consistent-io"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", + .description = "If this option is enabled, i/o will fail even if " + "one of the bricks is down in the replicas", + }, + {.key = {"use-compound-fops"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", + .op_version = {GD_OP_VERSION_3_8_4}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "This option exists only for backward compatibility " + "and configuring it doesn't have any effect"}, + {.key = {"use-anonymous-inode"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", + .op_version = {GD_OP_VERSION_8_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE, + .tags = {"replicate"}, + .description = "Setting this option heals directory renames efficiently"}, + + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .notify = notify, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "replicate", + .category = GF_MAINTAINED, }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index d49cb6ccb56..d62f9a9caf2 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -8,1169 +8,1416 @@ cases as published by the Free Software Foundation. */ - #ifndef __AFR_H__ #define __AFR_H__ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "call-stub.h" -#include "compat-errno.h" +#include <glusterfs/call-stub.h> +#include <glusterfs/compat-errno.h> #include "afr-mem-types.h" -#include "afr-self-heal-algorithm.h" #include "libxlator.h" -#include "timer.h" +#include <glusterfs/timer.h> +#include <glusterfs/syncop.h> + +#include "afr-self-heald.h" +#include "afr-messages.h" -#define AFR_XATTR_PREFIX "trusted.afr" +#define SHD_INODE_LRU_LIMIT 1 #define AFR_PATHINFO_HEADER "REPLICATE:" #define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size" +#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal" +#define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty" +#define AFR_DIRTY (((afr_private_t *)(THIS->private))->afr_dirty) + +#define AFR_LOCKEE_COUNT_MAX 3 +#define AFR_DOM_COUNT_MAX 3 +#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/ +#define AFR_DEFAULT_SPB_CHOICE_TIMEOUT 300 /*in seconds*/ + +#define ARBITER_BRICK_INDEX 2 +#define THIN_ARBITER_BRICK_INDEX 2 +#define AFR_TA_DOM_NOTIFY "afr.ta.dom-notify" +#define AFR_TA_DOM_MODIFY "afr.ta.dom-modify" + +#define AFR_LK_HEAL_DOM "afr.lock-heal.domain" + +#define AFR_HALO_MAX_LATENCY 99999 +#define AFR_ANON_DIR_PREFIX ".glusterfs-anonymous-inode" + +#define PFLAG_PENDING (1 << 0) +#define PFLAG_SBRAIN (1 << 1) + +typedef int (*afr_lock_cbk_t)(call_frame_t *frame, xlator_t *this); + +typedef int (*afr_read_txn_wind_t)(call_frame_t *frame, xlator_t *this, + int subvol); + +typedef int (*afr_inode_refresh_cbk_t)(call_frame_t *frame, xlator_t *this, + int err); + +typedef int (*afr_changelog_resume_t)(call_frame_t *frame, xlator_t *this); + +#define AFR_COUNT(array, max) \ + ({ \ + int __i; \ + int __res = 0; \ + for (__i = 0; __i < max; __i++) \ + if (array[__i]) \ + __res++; \ + __res; \ + }) +#define AFR_INTERSECT(dst, src1, src2, max) \ + ({ \ + int __i; \ + for (__i = 0; __i < max; __i++) \ + dst[__i] = src1[__i] && src2[__i]; \ + }) +#define AFR_CMP(a1, a2, len) \ + ({ \ + int __cmp = 0; \ + int __i; \ + for (__i = 0; __i < len; __i++) \ + if (a1[__i] != a2[__i]) { \ + __cmp = 1; \ + break; \ + } \ + __cmp; \ + }) +#define AFR_IS_ARBITER_BRICK(priv, index) \ + ((priv->arbiter_count == 1) && (index == ARBITER_BRICK_INDEX)) + +#define AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN(ret, errnum) \ + do { \ + local->op_ret = ret; \ + local->op_errno = errnum; \ + if (local->op_errno == EIO) \ + gf_msg(this->name, GF_LOG_ERROR, local->op_errno, \ + AFR_MSG_SPLIT_BRAIN, \ + "Failing %s on gfid %s: " \ + "split-brain observed.", \ + gf_fop_list[local->op], uuid_utoa(local->inode->gfid)); \ + } while (0) + +#define AFR_ERROR_OUT_IF_FDCTX_INVALID(__fd, __this, __error, __label) \ + do { \ + afr_fd_ctx_t *__fd_ctx = NULL; \ + __fd_ctx = afr_fd_ctx_get(__fd, __this); \ + if (__fd_ctx && __fd_ctx->is_fd_bad) { \ + __error = EBADF; \ + goto __label; \ + } \ + } while (0) -#define AFR_LOCKEE_COUNT_MAX 3 -#define AFR_DOM_COUNT_MAX 3 - -struct _pump_private; - -typedef int (*afr_expunge_done_cbk_t) (call_frame_t *frame, xlator_t *this, - int child, int32_t op_error, - int32_t op_errno); - -typedef int (*afr_impunge_done_cbk_t) (call_frame_t *frame, xlator_t *this, - int32_t op_error, int32_t op_errno); -typedef int (*afr_post_remove_call_t) (call_frame_t *frame, xlator_t *this); - -typedef int (*afr_lock_cbk_t) (call_frame_t *frame, xlator_t *this); -typedef void (*afr_lookup_done_cbk_t) (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno); +typedef enum { + AFR_READ_POLICY_FIRST_UP, + AFR_READ_POLICY_GFID_HASH, + AFR_READ_POLICY_GFID_PID_HASH, + AFR_READ_POLICY_LESS_LOAD, + AFR_READ_POLICY_LEAST_LATENCY, + AFR_READ_POLICY_LOAD_LATENCY_HYBRID, +} afr_read_hash_mode_t; typedef enum { - AFR_POS_UNKNOWN, - AFR_POS_LOCAL, - AFR_POS_REMOTE -} afr_child_pos_t; + AFR_FAV_CHILD_NONE, + AFR_FAV_CHILD_BY_SIZE, + AFR_FAV_CHILD_BY_CTIME, + AFR_FAV_CHILD_BY_MTIME, + AFR_FAV_CHILD_BY_MAJORITY, + AFR_FAV_CHILD_POLICY_MAX, +} afr_favorite_child_policy; typedef enum { - SPLIT_BRAIN = 1, - ALL_FOOLS = 2 -} afr_subvol_status_t; + AFR_SELFHEAL_DATA_FULL = 0, + AFR_SELFHEAL_DATA_DIFF, + AFR_SELFHEAL_DATA_DYNAMIC, +} afr_data_self_heal_type_t; typedef enum { - AFR_INODE_SET_READ_CTX = 1, - AFR_INODE_RM_STALE_CHILDREN, - AFR_INODE_SET_OPENDIR_DONE, - AFR_INODE_GET_READ_CTX, - AFR_INODE_GET_OPENDIR_DONE, -} afr_inode_op_t; - -typedef struct afr_inode_params_ { - afr_inode_op_t op; - union { - gf_boolean_t value; - struct { - int32_t read_child; - int32_t *children; - } read_ctx; - } u; -} afr_inode_params_t; - -typedef enum afr_spb_state { - DONT_KNOW, - SPB, - NO_SPB -} afr_spb_state_t; - -typedef struct afr_inode_ctx_ { - uint64_t masks; - int32_t *fresh_children;//increasing order of latency - afr_spb_state_t mdata_spb; - afr_spb_state_t data_spb; - uint32_t open_fd_count; -} afr_inode_ctx_t; + AFR_CHILD_UNKNOWN = -1, + AFR_CHILD_ZERO, + AFR_CHILD_ONE, + AFR_CHILD_THIN_ARBITER, +} afr_child_index; typedef enum { - NONE, - INDEX, - FULL, -} afr_crawl_type_t; - -typedef struct afr_self_heald_ { - gf_boolean_t enabled; - gf_boolean_t iamshd; - afr_crawl_type_t *pending; - gf_boolean_t *inprogress; - afr_child_pos_t *pos; - gf_timer_t **timer; - eh_t *healed; - eh_t *heal_failed; - eh_t *split_brain; - char *node_uuid; - int timeout; -} afr_self_heald_t; + TA_WAIT_FOR_NOTIFY_LOCK_REL, /*FOP came after notify domain lock upcall + notification and waiting for its release.*/ + TA_GET_INFO_FROM_TA_FILE, /*FOP needs post-op on ta file to get + *info about which brick is bad.*/ + TA_INFO_IN_MEMORY_SUCCESS, /*Bad brick info is in memory and fop failed + *on BAD brick - Success*/ + TA_INFO_IN_MEMORY_FAILED, /*Bad brick info is in memory and fop failed + *on GOOD brick - Failed*/ + TA_SUCCESS, /*FOP succeeded on both data bricks.*/ +} afr_ta_fop_state_t; + +struct afr_nfsd { + uint32_t halo_max_latency_msec; + gf_boolean_t iamnfsd; +}; + +typedef struct _afr_lk_heal_info { + fd_t *fd; + int32_t cmd; + struct gf_flock flock; + dict_t *xdata_req; + unsigned char *locked_nodes; + struct list_head pos; + gf_lkowner_t lk_owner; + pid_t pid; + int32_t *child_up_event_gen; + int32_t *child_down_event_gen; +} afr_lk_heal_info_t; typedef struct _afr_private { - gf_lock_t lock; /* to guard access to child_count, etc */ - unsigned int child_count; /* total number of children */ + gf_lock_t lock; /* to guard access to child_count, etc */ + unsigned int child_count; /* total number of children */ + unsigned int arbiter_count; /*subset of child_count. + Has to be 0 or 1.*/ + + xlator_t **children; + + inode_t *root_inode; + + int favorite_child; /* subvolume to be preferred in resolving + split-brain cases */ + /* For thin-arbiter. */ + uuid_t ta_gfid; + unsigned int thin_arbiter_count; /* 0 or 1 at the moment.*/ + int ta_bad_child_index; + int ta_event_gen; + unsigned int ta_in_mem_txn_count; + unsigned int ta_on_wire_txn_count; + struct list_head ta_waitq; + struct list_head ta_onwireq; + + unsigned char *anon_inode; + unsigned char *child_up; + unsigned char *halo_child_up; + int64_t *child_latency; + unsigned char *local; + + char **pending_key; + + afr_data_self_heal_type_t data_self_heal_algorithm; + unsigned int data_self_heal_window_size; /* max number of pipelined + read/writes */ + + struct list_head heal_waiting; /*queue for files that need heal*/ + uint32_t heal_wait_qlen; /*configurable queue length for heal_waiting*/ + int32_t heal_waiters; /* No. of elements currently in wait queue.*/ + + struct list_head healing; /* queue for files that are undergoing + background heal*/ + uint32_t background_self_heal_count; /*configurable queue length for + healing queue*/ + int32_t healers; /* No. of elements currently undergoing background + heal*/ + + gf_boolean_t release_ta_notify_dom_lock; + + gf_boolean_t metadata_self_heal; /* on/off */ + gf_boolean_t entry_self_heal; /* on/off */ + + gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */ + int read_child; /* read-subvolume */ + gf_atomic_t *pending_reads; /*No. of pending read cbks per child.*/ + + gf_timer_t *timer; /* launched when parent up is received */ + + unsigned int wait_count; /* # of servers to wait for success */ + + unsigned char ta_child_up; + gf_boolean_t optimistic_change_log; + gf_boolean_t eager_lock; + gf_boolean_t pre_op_compat; /* on/off */ + uint32_t post_op_delay_secs; + unsigned int quorum_count; + + off_t ta_notify_dom_lock_offset; + afr_favorite_child_policy fav_child_policy; /*Policy to use for automatic + resolution of split-brains.*/ + afr_read_hash_mode_t hash_mode; /* for when read_child is not set */ + + int32_t *last_event; + + /* @event_generation: Keeps count of number of events received which can + potentially impact consistency decisions. The events are CHILD_UP + and CHILD_DOWN, when we have to recalculate the freshness/staleness + of copies to detect if changes had happened while the other server + was down. CHILD_DOWN and CHILD_UP can also be received on network + disconnect/reconnects and not necessarily server going down/up. + Recalculating freshness/staleness on network events is equally + important as we might have had a network split brain. + */ + uint32_t event_generation; + char vol_uuid[UUID_SIZE + 1]; + + gf_boolean_t choose_local; + gf_boolean_t did_discovery; + gf_boolean_t ensure_durability; + gf_boolean_t halo_enabled; + gf_boolean_t consistent_metadata; + gf_boolean_t need_heal; + gf_boolean_t granular_locks; + uint64_t sh_readdir_size; + char *sh_domain; + char *afr_dirty; + + uint64_t spb_choice_timeout; + + afr_self_heald_t shd; + struct afr_nfsd nfsd; + + uint32_t halo_max_latency_msec; + uint32_t halo_max_replicas; + uint32_t halo_min_replicas; + + gf_boolean_t full_lock; + gf_boolean_t esh_granular; + gf_boolean_t consistent_io; + gf_boolean_t data_self_heal; /* on/off */ + gf_boolean_t use_anon_inode; + + /*For lock healing.*/ + struct list_head saved_locks; + struct list_head lk_healq; + + /*For anon-inode handling */ + char anon_inode_name[NAME_MAX + 1]; + char anon_gfid_str[UUID_SIZE + 1]; +} afr_private_t; - unsigned int read_child_rr; /* round-robin index of the read_child */ - gf_lock_t read_child_lock; /* lock to protect above */ +typedef enum { + AFR_DATA_TRANSACTION, /* truncate, write, ... */ + AFR_METADATA_TRANSACTION, /* chmod, chown, ... */ + AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */ + AFR_ENTRY_RENAME_TRANSACTION, /* rename */ +} afr_transaction_type; - xlator_t **children; +/* + xattr format: trusted.afr.volume = [x y z] + x - data pending + y - metadata pending + z - entry pending +*/ - int first_lookup; - inode_t *root_inode; +static inline int +afr_index_for_transaction_type(afr_transaction_type type) +{ + switch (type) { + case AFR_DATA_TRANSACTION: + return 0; - unsigned char *child_up; + case AFR_METADATA_TRANSACTION: + return 1; - char **pending_key; + case AFR_ENTRY_TRANSACTION: + case AFR_ENTRY_RENAME_TRANSACTION: + return 2; + } - char *data_self_heal; /* on/off/open */ - char * data_self_heal_algorithm; /* name of algorithm */ - unsigned int data_self_heal_window_size; /* max number of pipelined - read/writes */ + return -1; /* make gcc happy */ +} - unsigned int background_self_heal_count; - unsigned int background_self_heals_started; - gf_boolean_t metadata_self_heal; /* on/off */ - gf_boolean_t entry_self_heal; /* on/off */ +static inline int +afr_index_from_ia_type(ia_type_t type) +{ + switch (type) { + case IA_IFDIR: + return afr_index_for_transaction_type(AFR_ENTRY_TRANSACTION); + case IA_IFREG: + return afr_index_for_transaction_type(AFR_DATA_TRANSACTION); + default: + return -1; + } +} - gf_boolean_t data_change_log; /* on/off */ - gf_boolean_t metadata_change_log; /* on/off */ - gf_boolean_t entry_change_log; /* on/off */ +typedef struct { + struct gf_flock flock; + loc_t loc; + fd_t *fd; + char *basename; + unsigned char *locked_nodes; + int locked_count; - int read_child; /* read-subvolume */ - unsigned int hash_mode; /* for when read_child is not set */ - int favorite_child; /* subvolume to be preferred in resolving - split-brain cases */ +} afr_lockee_t; - gf_boolean_t inodelk_trace; - gf_boolean_t entrylk_trace; +int +afr_entry_lockee_cmp(const void *l1, const void *l2); - gf_boolean_t strict_readdir; +typedef struct { + loc_t *lk_loc; - unsigned int wait_count; /* # of servers to wait for success */ + afr_lockee_t lockee[AFR_LOCKEE_COUNT_MAX]; - uint64_t up_count; /* number of CHILD_UPs we have seen */ - uint64_t down_count; /* number of CHILD_DOWNs we have seen */ + const char *lk_basename; + const char *lower_basename; + const char *higher_basename; - struct _pump_private *pump_private; /* Set if we are loaded as pump */ - int use_afr_in_pump; + unsigned char *lower_locked_nodes; - pthread_mutex_t mutex; - struct list_head saved_fds; /* list of fds on which locks have succeeded */ - gf_boolean_t optimistic_change_log; - gf_boolean_t eager_lock; - uint32_t post_op_delay_secs; - unsigned int quorum_count; + afr_lock_cbk_t lock_cbk; - char vol_uuid[UUID_SIZE + 1]; - int32_t *last_event; - afr_self_heald_t shd; - gf_boolean_t choose_local; - gf_boolean_t did_discovery; - gf_boolean_t readdir_failover; - uint64_t sh_readdir_size; - gf_boolean_t ensure_durability; - char *sh_domain; -} afr_private_t; + int lockee_count; + + int32_t lk_call_count; + int32_t lk_expected_count; + int32_t lk_attempted_count; + + int32_t lock_op_ret; + int32_t lock_op_errno; + char *domain; /* Domain on which inode/entry lock/unlock in progress.*/ + int32_t lock_count; + char lower_locked; + char higher_locked; +} afr_internal_lock_t; + +struct afr_reply { + int valid; + int32_t op_ret; + dict_t *xattr; /*For xattrop*/ + dict_t *xdata; + struct iatt poststat; + struct iatt postparent; + struct iatt prestat; + struct iatt preparent; + struct iatt preparent2; + struct iatt postparent2; + int32_t op_errno; + /* For rchecksum */ + uint8_t checksum[SHA256_DIGEST_LENGTH]; + gf_boolean_t buf_has_zeroes; + gf_boolean_t fips_mode_rchecksum; + /* For lookup */ + int8_t need_heal; +}; typedef enum { - AFR_SELF_HEAL_NOT_ATTEMPTED, - AFR_SELF_HEAL_STARTED, - AFR_SELF_HEAL_FAILED, - AFR_SELF_HEAL_SYNC_BEGIN, -} afr_self_heal_status; + AFR_FD_NOT_OPENED, + AFR_FD_OPENED, + AFR_FD_OPENING +} afr_fd_open_status_t; typedef struct { - afr_self_heal_status gfid_or_missing_entry_self_heal; - afr_self_heal_status metadata_self_heal; - afr_self_heal_status data_self_heal; - afr_self_heal_status entry_self_heal; -} afr_sh_status_for_all_type; + afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */ + int flags; + + /* the subvolume on which the latest sequence of readdirs (starting + at offset 0) has begun. Till the next readdir request with 0 offset + arrives, we continue to read off this subvol. + */ + int readdir_subvol; + /* lock-healing related members. */ + gf_boolean_t is_fd_bad; + afr_lk_heal_info_t *lk_heal_info; -typedef enum { - AFR_SELF_HEAL_ENTRY, - AFR_SELF_HEAL_METADATA, - AFR_SELF_HEAL_DATA, - AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY, - AFR_SELF_HEAL_INVALID = -1, -} afr_self_heal_type; +} afr_fd_ctx_t; typedef enum { - AFR_CHECK_ALL, - AFR_CHECK_SPECIFIC, -} afr_sh_fail_check_type; - -struct afr_self_heal_ { - /* External interface: These are variables (some optional) that - are set by whoever has triggered self-heal */ - - gf_boolean_t do_data_self_heal; - gf_boolean_t do_metadata_self_heal; - gf_boolean_t do_entry_self_heal; - gf_boolean_t do_gfid_self_heal; - gf_boolean_t do_missing_entry_self_heal; - gf_boolean_t force_confirm_spb; /* Check for split-brains even when - self-heal is turned off */ - - gf_boolean_t forced_merge; /* Is this a self-heal triggered to - forcibly merge the directories? */ - - gf_boolean_t background; /* do self-heal in background - if possible */ - ia_type_t type; /* st_mode of the entry we're doing - self-heal on */ - inode_t *inode; /* inode on which the self-heal is - performed on */ - uuid_t sh_gfid_req; /* gfid self-heal needs to be done - with this gfid if it is not null */ - - /* Function to call to unwind. If self-heal is being done in the - background, this function will be called as soon as possible. */ - - int (*unwind) (call_frame_t *frame, xlator_t *this, int32_t op_ret, - int32_t op_errno, int32_t sh_failed); - - /* End of external interface members */ - - - /* array of stat's, one for each child */ - struct iatt *buf; - struct iatt *parentbufs; - struct iatt parentbuf; - struct iatt entrybuf; - - afr_expunge_done_cbk_t expunge_done; - afr_impunge_done_cbk_t impunge_done; - - /* array of xattr's, one for each child */ - dict_t **xattr; - - /* array containing if the lookups succeeded in the order of response - */ - int32_t *success_children; - int success_count; - /* array containing the fresh children found in the self-heal process */ - int32_t *fresh_children; - /* array containing the fresh children found in the parent lookup */ - int32_t *fresh_parent_dirs; - /* array of errno's, one for each child */ - int *child_errno; - /*loc used for lookup*/ - loc_t lookup_loc; - int32_t lookup_flags; - afr_lookup_done_cbk_t lookup_done; - - int32_t **pending_matrix; - int32_t **delta_matrix; - - int32_t op_ret; - int32_t op_errno; - - int *sources; - int source; - int active_source; - int active_sinks; - unsigned char *success; - unsigned char *locked_nodes; - int lock_count; - - const char *linkname; - gf_boolean_t entries_skipped; - - gf_boolean_t actual_sh_started; - gf_boolean_t sync_done; - gf_boolean_t data_lock_held; - gf_boolean_t sh_dom_lock_held; - gf_boolean_t eof_reached; - fd_t *healing_fd; - int file_has_holes; - blksize_t block_size; - off_t file_size; - off_t offset; - unsigned char *write_needed; - uint8_t *checksum; - afr_post_remove_call_t post_remove_call; - - char *data_sh_info; - char *metadata_sh_info; + AFR_FOP_LOCK_PARALLEL, + AFR_FOP_LOCK_SERIAL, + AFR_FOP_LOCK_QUORUM_FAILED, +} afr_fop_lock_state_t; + +typedef struct _afr_inode_lock_t { + /* @num_inodelks: + Number of inodelks queried from the server, as queried through + xdata in FOPs. Currently, used to decide if eager-locking must be + temporarily disabled. + */ + int32_t num_inodelks; + unsigned int event_generation; + gf_timer_t *delay_timer; + struct list_head owners; /*Transactions that are performing fop*/ + struct list_head post_op; /*Transactions that are done with the fop + *So can not conflict with the fops*/ + struct list_head waiting; /*Transaction that are waiting for + *conflicting transactions to complete*/ + struct list_head frozen; /*Transactions that need to go as part of + * next batch of eager-lock*/ + gf_boolean_t release; + gf_boolean_t acquired; +} afr_lock_t; + +typedef struct _afr_inode_ctx { + uint64_t read_subvol; + uint64_t write_subvol; + int lock_count; + int spb_choice; + gf_timer_t *timer; + unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS]; + int inherited[AFR_NUM_CHANGE_LOGS]; + int on_disk[AFR_NUM_CHANGE_LOGS]; + /*Only 2 types of transactions support eager-locks now. DATA/METADATA*/ + afr_lock_t lock[2]; + + /* @open_fd_count: + Number of open FDs queried from the server, as queried through + xdata in FOPs. Currently, used to decide if eager-locking must be + temporarily disabled. + */ + uint32_t open_fd_count; + gf_boolean_t need_refresh; + + /* set if any write on this fd was a non stable write + (i.e, without O_SYNC or O_DSYNC) + */ + gf_boolean_t witnessed_unstable_write; +} afr_inode_ctx_t; - loc_t parent_loc; - call_frame_t *orig_frame; - call_frame_t *old_loop_frame; - gf_boolean_t unwound; - - afr_sh_algo_private_t *private; - afr_sh_status_for_all_type afr_all_sh_status; - afr_self_heal_type sh_type_in_action; - - struct afr_sh_algorithm *algo; - afr_lock_cbk_t data_lock_success_handler; - afr_lock_cbk_t data_lock_failure_handler; - gf_boolean_t data_lock_block; - int (*completion_cbk) (call_frame_t *frame, xlator_t *this); - int (*sh_data_algo_start) (call_frame_t *frame, xlator_t *this); - int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this); - int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this); - void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this); - - call_frame_t *sh_frame; -}; +typedef struct _afr_local { + glusterfs_fop_t op; + unsigned int call_count; -typedef struct afr_self_heal_ afr_self_heal_t; + /* @event_generation: copy of priv->event_generation taken at the + time of starting the transaction. The copy is made so that we + have a stable value through the various phases of the transaction. + */ + unsigned int event_generation; -typedef enum { - AFR_DATA_TRANSACTION, /* truncate, write, ... */ - AFR_METADATA_TRANSACTION, /* chmod, chown, ... */ - AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */ - AFR_ENTRY_RENAME_TRANSACTION, /* rename */ -} afr_transaction_type; + uint32_t open_fd_count; + int32_t num_inodelks; -typedef enum { - AFR_TRANSACTION_LK, - AFR_SELFHEAL_LK, -} transaction_lk_type_t; + int32_t op_ret; + int32_t op_errno; -typedef enum { - AFR_LOCK_OP, - AFR_UNLOCK_OP, -} afr_lock_op_type_t; + int dirty[AFR_NUM_CHANGE_LOGS]; -typedef enum { - AFR_DATA_SELF_HEAL_LK, - AFR_METADATA_SELF_HEAL_LK, - AFR_ENTRY_SELF_HEAL_LK, -}selfheal_lk_type_t; + int32_t **pending; -typedef enum { - AFR_INODELK_TRANSACTION, - AFR_INODELK_NB_TRANSACTION, - AFR_ENTRYLK_TRANSACTION, - AFR_ENTRYLK_NB_TRANSACTION, - AFR_INODELK_SELFHEAL, - AFR_INODELK_NB_SELFHEAL, - AFR_ENTRYLK_SELFHEAL, - AFR_ENTRYLK_NB_SELFHEAL, -} afr_lock_call_type_t; + loc_t loc; + loc_t newloc; -/* - xattr format: trusted.afr.volume = [x y z] - x - data pending - y - metadata pending - z - entry pending -*/ + fd_t *fd; + afr_fd_ctx_t *fd_ctx; -static inline int -afr_index_for_transaction_type (afr_transaction_type type) -{ - switch (type) { + /* @child_up: copy of priv->child_up taken at the time of transaction + start. The copy is taken so that we have a stable child_up array + through the phases of the transaction as priv->child_up[i] can keep + changing through time. + */ + unsigned char *child_up; - case AFR_DATA_TRANSACTION: - return 0; + /* @read_attempted: + array of flags representing subvolumes where read operations of + the read transaction have already been attempted. The array is + first pre-filled with down subvolumes, and as reads are performed + on other subvolumes, those are set as well. This way if the read + operation fails we do not retry on that subvolume again. + */ + unsigned char *read_attempted; - case AFR_METADATA_TRANSACTION: - return 1; + /* @readfn: - case AFR_ENTRY_TRANSACTION: - case AFR_ENTRY_RENAME_TRANSACTION: - return 2; - } + pointer to function which will perform the read operation on a given + subvolume. Used in read transactions. + */ - return -1; /* make gcc happy */ -} + afr_read_txn_wind_t readfn; -typedef struct { - loc_t loc; - char *basename; - unsigned char *locked_nodes; - int locked_count; + /* @inode: -} afr_entry_lockee_t; + the inode on which the read txn is performed on. ref'ed and copied + from either fd->inode or loc.inode + */ -int -afr_entry_lockee_cmp (const void *l1, const void *l2); + inode_t *inode; -typedef struct { - char *domain; /* Domain on which inodelk is taken */ - struct gf_flock flock; - unsigned char *locked_nodes; - int32_t lock_count; -} afr_inodelk_t; + /* @parent[2]: -typedef struct { - loc_t *lk_loc; + parent inode[s] on which directory transactions are performed. + */ - int lockee_count; - afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX]; + inode_t *parent; + inode_t *parent2; - afr_inodelk_t inodelk[AFR_DOM_COUNT_MAX]; - const char *lk_basename; - const char *lower_basename; - const char *higher_basename; - char lower_locked; - char higher_locked; + /* @readable: - unsigned char *locked_nodes; - unsigned char *lower_locked_nodes; + array of flags representing servers from which a read can be + performed. This is the output of afr_inode_refresh() + */ + unsigned char *readable; + unsigned char *readable2; /*For rename transaction*/ - selfheal_lk_type_t selfheal_lk_type; - transaction_lk_type_t transaction_lk_type; + afr_inode_refresh_cbk_t refreshfn; - int32_t lock_count; - int32_t entrylk_lock_count; + /* @refreshinode: - uint64_t lock_number; - int32_t lk_call_count; - int32_t lk_expected_count; - int32_t lk_attempted_count; + Inode currently getting refreshed. + */ + inode_t *refreshinode; - int32_t lock_op_ret; - int32_t lock_op_errno; - afr_lock_cbk_t lock_cbk; - char *domain; /* Domain on which inode/entry lock/unlock in progress.*/ -} afr_internal_lock_t; + dict_t *xattr_req; -typedef struct _afr_locked_fd { - fd_t *fd; - struct list_head list; -} afr_locked_fd_t; + dict_t *dict; -struct afr_reply { - int valid; - int32_t op_ret; - int32_t op_errno; -}; + int read_subvol; /* Current read subvolume */ -typedef struct _afr_local { - int uid; - int gid; - unsigned int call_count; - unsigned int success_count; - unsigned int enoent_count; - uint32_t open_fd_count; - gf_boolean_t update_open_fd_count; + int optimistic_change_log; + afr_internal_lock_t internal_lock; - unsigned int unhealable; + /*To handle setattr/setxattr on yet to be linked inode from dht*/ + uuid_t refreshgfid; - unsigned int read_child_index; - unsigned char read_child_returned; - unsigned int first_up_child; + /* @refreshed: - gf_lkowner_t saved_lk_owner; + the inode was "refreshed" (i.e, pending xattrs from all subvols + freshly inspected and inode ctx updated accordingly) as part of + this transaction already. + */ + gf_boolean_t refreshed; - int32_t op_ret; - int32_t op_errno; + gf_boolean_t update_num_inodelks; + gf_boolean_t update_open_fd_count; - int32_t **pending; + /* + @pre_op_compat: - loc_t loc; - loc_t newloc; + compatibility mode of pre-op. send a separate pre-op and + op operations as part of transaction, rather than combining + */ - fd_t *fd; + gf_boolean_t pre_op_compat; - glusterfs_fop_t fop; + /* Is the current writev() going to perform a stable write? + i.e, is fd->flags or @flags writev param have O_SYNC or + O_DSYNC? + */ + gf_boolean_t stable_write; - unsigned char *child_up; - int32_t *fresh_children; //in the order of response + /* This write appended to the file. Nnot necessarily O_APPEND, + just means the offset of write was at the end of file. + */ + gf_boolean_t append_write; - int32_t *child_errno; + /* + This struct contains the arguments for the "continuation" + (scheme-like) of fops + */ - dict_t *xattr_req; + struct { + struct { + struct statvfs buf; + unsigned char buf_set; + } statfs; - int32_t inodelk_count; - int32_t entrylk_count; + struct { + fd_t *fd; + int32_t flags; + } open; - afr_internal_lock_t internal_lock; + struct { + struct gf_flock user_flock; + struct gf_flock ret_flock; + unsigned char *locked_nodes; + int32_t cmd; + /*For lock healing only.*/ + unsigned char *dom_locked_nodes; + int32_t *dom_lock_op_ret; + int32_t *dom_lock_op_errno; + struct gf_flock *getlk_rsp; + } lk; + + /* inode read */ - afr_locked_fd_t *locked_fd; - int32_t source_child; - int32_t lock_recovery_child; + struct { + int32_t mask; + int last_index; /* index of the child we tried previously */ + } access; - dict_t *dict; - int optimistic_change_log; - gf_boolean_t delayed_post_op; + struct { + int last_index; + } stat; - /* Is the current writev() going to perform a stable write? - i.e, is fd->flags or @flags writev param have O_SYNC or - O_DSYNC? - */ - gf_boolean_t stable_write; + struct { + int last_index; + } fstat; - /* This write appended to the file. Nnot necessarily O_APPEND, - just means the offset of write was at the end of file. - */ - gf_boolean_t append_write; + struct { + size_t size; + int last_index; + } readlink; - /* - This struct contains the arguments for the "continuation" - (scheme-like) of fops - */ + struct { + char *name; + long xattr_len; + int last_index; + } getxattr; + + struct { + size_t size; + off_t offset; + int last_index; + uint32_t flags; + } readv; + + /* dir read */ + + struct { + uint32_t *checksum; + int success_count; + int32_t op_ret; + int32_t op_errno; + } opendir; + + struct { + int32_t op_ret; + int32_t op_errno; + size_t size; + off_t offset; + dict_t *dict; + int last_index; + gf_boolean_t failed; + } readdir; + /* inode write */ + + struct { + struct iatt prebuf; + struct iatt postbuf; + } inode_wfop; // common structure for all inode-write-fops + + struct { + struct iovec *vector; + struct iobref *iobref; + off_t offset; + int32_t op_ret; + int32_t count; + uint32_t flags; + } writev; + + struct { + off_t offset; + } truncate; + + struct { + off_t offset; + } ftruncate; + + struct { + struct iatt in_buf; + int32_t valid; + } setattr; - int op; struct { - struct { - unsigned char buf_set; - struct statvfs buf; - } statfs; - - struct { - uint32_t parent_entrylk; - uuid_t gfid_req; - inode_t *inode; - struct iatt buf; - struct iatt postparent; - dict_t **xattrs; - dict_t *xattr; - struct iatt *postparents; - struct iatt *bufs; - int32_t read_child; - int32_t *sources; - int32_t *success_children; - int32_t **pending_matrix; - gf_boolean_t fresh_lookup; - gf_boolean_t possible_spb; - } lookup; - - struct { - int32_t flags; - } open; - - struct { - int32_t cmd; - struct gf_flock user_flock; - struct gf_flock ret_flock; - unsigned char *locked_nodes; - } lk; - - /* inode read */ - - struct { - int32_t mask; - int last_index; /* index of the child we tried previously */ - } access; - - struct { - int last_index; - } stat; - - struct { - int last_index; - } fstat; - - struct { - size_t size; - int last_index; - } readlink; - - struct { - char *name; - int last_index; - long xattr_len; - } getxattr; - - struct { - size_t size; - off_t offset; - int last_index; - uint32_t flags; - } readv; - - /* dir read */ - - struct { - int success_count; - int32_t op_ret; - int32_t op_errno; - - uint32_t *checksum; - } opendir; - - struct { - int32_t op_ret; - int32_t op_errno; - size_t size; - off_t offset; - dict_t *dict; - gf_boolean_t failed; - int last_index; - } readdir; - /* inode write */ - - struct { - struct iatt prebuf; - struct iatt postbuf; - } inode_wfop; //common structure for all inode-write-fops - - struct { - int32_t op_ret; - - struct iovec *vector; - struct iobref *iobref; - int32_t count; - off_t offset; - uint32_t flags; - } writev; - - struct { - off_t offset; - } truncate; - - struct { - off_t offset; - } ftruncate; - - struct { - struct iatt in_buf; - int32_t valid; - } setattr; - - struct { - struct iatt in_buf; - int32_t valid; - } fsetattr; - - struct { - dict_t *dict; - int32_t flags; - } setxattr; - - struct { - dict_t *dict; - int32_t flags; - } fsetxattr; - - struct { - char *name; - } removexattr; - - struct { - dict_t *xattr; - } xattrop; - - struct { - dict_t *xattr; - } fxattrop; - - /* dir write */ - - struct { - inode_t *inode; - struct iatt buf; - struct iatt preparent; - struct iatt postparent; - struct iatt prenewparent; - struct iatt postnewparent; - } dir_fop; //common structure for all dir fops - - struct { - fd_t *fd; - dict_t *params; - int32_t flags; - mode_t mode; - } create; - - struct { - dev_t dev; - mode_t mode; - dict_t *params; - } mknod; - - struct { - int32_t mode; - dict_t *params; - } mkdir; - - struct { - int flags; - } rmdir; - - struct { - dict_t *params; - char *linkpath; - } symlink; - - struct { - int32_t mode; - off_t offset; - size_t len; - } fallocate; - - struct { - off_t offset; - size_t len; - } discard; - - } cont; + struct iatt in_buf; + int32_t valid; + } fsetattr; struct { - off_t start, len; + dict_t *dict; + int32_t flags; + } setxattr; - gf_boolean_t eager_lock_on; - int *eager_lock; + struct { + dict_t *dict; + int32_t flags; + } fsetxattr; - char *basename; - char *new_basename; + struct { + char *name; + } removexattr; - loc_t parent_loc; - loc_t new_parent_loc; + struct { + dict_t *xattr; + gf_xattrop_flags_t optype; + } xattrop; - afr_transaction_type type; + /* dir write */ - /* pre-compute the post piggyback status before - entering POST-OP phase - */ - int *postop_piggybacked; + struct { + inode_t *inode; + struct iatt buf; + struct iatt preparent; + struct iatt postparent; + struct iatt prenewparent; + struct iatt postnewparent; + } dir_fop; // common structure for all dir fops - /* stub to resume on destruction - of the transaction frame */ - call_stub_t *resume_stub; + struct { + fd_t *fd; + dict_t *params; + int32_t flags; + mode_t mode; + } create; - struct list_head eager_locked; + struct { + dict_t *params; + dev_t dev; + mode_t mode; + } mknod; - int32_t **txn_changelog;//changelog after pre+post ops - unsigned char *pre_op; + struct { + dict_t *params; + int32_t mode; + } mkdir; - call_frame_t *main_frame; + struct { + dict_t *params; + char *linkpath; + } symlink; - int (*fop) (call_frame_t *frame, xlator_t *this); + struct { + off_t offset; + size_t len; + int32_t mode; + } fallocate; - int (*done) (call_frame_t *frame, xlator_t *this); + struct { + off_t offset; + size_t len; + } discard; - int (*resume) (call_frame_t *frame, xlator_t *this); + struct { + off_t offset; + off_t len; + struct iatt prebuf; + struct iatt postbuf; + } zerofill; - int (*unwind) (call_frame_t *frame, xlator_t *this); + struct { + char *volume; + int32_t cmd; + int32_t in_cmd; + struct gf_flock in_flock; + struct gf_flock flock; + void *xdata; + } inodelk; - /* post-op hook */ - } transaction; + struct { + char *volume; + char *basename; + void *xdata; + entrylk_cmd in_cmd; + entrylk_cmd cmd; + entrylk_type type; + } entrylk; - afr_self_heal_t self_heal; + struct { + off_t offset; + gf_seek_what_t what; + } seek; - struct marker_str marker; + struct { + struct gf_lease user_lease; + struct gf_lease ret_lease; + unsigned char *locked_nodes; + } lease; - /* extra data for fops */ - dict_t *xdata_req; - dict_t *xdata_rsp; + struct { + int flags; + } rmdir; - mode_t umask; - int xflag; - gf_boolean_t do_discovery; - struct afr_reply *replies; -} afr_local_t; + struct { + int32_t datasync; + } fsync; -typedef enum { - AFR_FD_NOT_OPENED, - AFR_FD_OPENED, - AFR_FD_OPENING -} afr_fd_open_status_t; + struct { + uuid_t gfid_req; + gf_boolean_t needs_fresh_lookup; + } lookup; -typedef struct { - unsigned int *pre_op_done; - afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */ - unsigned int *pre_op_piggyback; + } cont; - unsigned int *lock_piggyback; - unsigned int *lock_acquired; + struct { + char *basename; + char *new_basename; - int flags; - uint64_t up_count; /* number of CHILD_UPs this fd has seen */ - uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */ + loc_t parent_loc; + loc_t new_parent_loc; - int32_t last_tried; + /* stub to resume on destruction + of the transaction frame */ + call_stub_t *resume_stub; - int hit, miss; - gf_boolean_t failed_over; - struct list_head entries; /* needed for readdir failover */ + struct list_head owner_list; + struct list_head wait_list; - unsigned char *locked_on; /* which subvolumes locks have been successful */ + unsigned char *pre_op; - /* used for delayed-post-op optimization */ - pthread_mutex_t delay_lock; - gf_timer_t *delay_timer; - call_frame_t *delay_frame; - int call_child; + /* Changelog xattr dict for [f]xattrop*/ + dict_t **changelog_xdata; + unsigned char *pre_op_sources; - /* set if any write on this fd was a non stable write - (i.e, without O_SYNC or O_DSYNC) - */ - gf_boolean_t witnessed_unstable_write; + /* @failed_subvols: subvolumes on which a pre-op or a + FOP failed. */ + unsigned char *failed_subvols; - /* list of frames currently in progress */ - struct list_head eager_locked; -} afr_fd_ctx_t; + call_frame_t *main_frame; /*Fop frame*/ + call_frame_t *frame; /*Transaction frame*/ + int (*wind)(call_frame_t *frame, xlator_t *this, int subvol); -/* try alloc and if it fails, goto label */ -#define AFR_LOCAL_ALLOC_OR_GOTO(var, label) do { \ - var = mem_get0 (THIS->local_pool); \ - if (!var) { \ - gf_log (this->name, GF_LOG_ERROR, \ - "out of memory :("); \ - op_errno = ENOMEM; \ - goto label; \ - } \ - } while (0); + int (*unwind)(call_frame_t *frame, xlator_t *this); + off_t start, len; -/* did a call fail due to a child failing? */ -#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \ - ((op_errno == ENOTCONN) || \ - (op_errno == EBADFD))) + afr_transaction_type type; -#define afr_fop_failed(op_ret, op_errno) ((op_ret) == -1) + int32_t in_flight_sb_errno; /* This is where the cause of the + failure on the last good copy of + the file is stored. + */ -/* have we tried all children? */ -#define all_tried(i, count) ((i) == (count) - 1) + /* @changelog_resume: function to be called after changlogging + (either pre-op or post-op) is done + */ + afr_changelog_resume_t changelog_resume; -int32_t -afr_set_dict_gfid (dict_t *dict, uuid_t gfid); + gf_boolean_t eager_lock_on; + gf_boolean_t do_eager_unlock; -int -pump_command_reply (call_frame_t *frame, xlator_t *this); + /* @dirtied: flag which indicates whether we set dirty flag + in the OP. Typically true when we are performing operation + on more than one subvol and optimistic changelog is disabled -int32_t -afr_notify (xlator_t *this, int32_t event, void *data, void *data2); + A 'true' value set in @dirtied flag means an 'undirtying' + has to be done in POST-OP phase. + */ + gf_boolean_t dirtied; -int -afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local, - loc_t *loc, char *basename, int child_count); + /* @inherited: flag which indicates that the dirty flags + of the previous transaction were inherited + */ + gf_boolean_t inherited; -void -afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock); + /* + @no_uninherit: flag which indicates that a pre_op_uninherit() + must _not_ be attempted (and returned as failure) always. This + flag is set when a hard pre-op is performed, but not accounted + for it in fd_ctx->on_disk[]. Such transactions are "isolated" + from the pre-op piggybacking entirely and therefore uninherit + must not be attempted. + */ + gf_boolean_t no_uninherit; + + gf_boolean_t in_flight_sb; /* Indicator for occurrence of + split-brain while in the middle of + a txn. */ + /* @uninherit_done: + @uninherit_value: + + The above pair variables make pre_op_uninherit() idempotent. + Both are FALSE initially. The first call to pre_op_uninherit + sets @uninherit_done to TRUE and the return value to + @uninherit_value. Further calls will check for @uninherit_done + to be TRUE and if so will simply return @uninherit_value. + */ + gf_boolean_t uninherit_done; + gf_boolean_t uninherit_value; + + gf_boolean_t disable_delayed_post_op; + } transaction; + + syncbarrier_t barrier; + + /* extra data for fops */ + dict_t *xdata_req; + dict_t *xdata_rsp; + + dict_t *xattr_rsp; /*for [f]xattrop*/ + + mode_t umask; + int xflag; + struct afr_reply *replies; + + /* For client side background heals. */ + struct list_head healer; + call_frame_t *heal_frame; + + afr_inode_ctx_t *inode_ctx; + + /*For thin-arbiter transactions.*/ + int ta_failed_subvol; + int ta_event_gen; + struct list_head ta_waitq; + struct list_head ta_onwireq; + afr_ta_fop_state_t fop_state; + afr_fop_lock_state_t fop_lock_state; + gf_lkowner_t saved_lk_owner; + unsigned char read_txn_query_child; + unsigned char ta_child_up; + gf_boolean_t do_discovery; + gf_boolean_t need_full_crawl; + gf_boolean_t is_read_txn; + gf_boolean_t is_new_entry; +} afr_local_t; + +typedef struct afr_spbc_timeout { + call_frame_t *frame; + loc_t *loc; + int spb_child_index; + gf_boolean_t d_spb; + gf_boolean_t m_spb; +} afr_spbc_timeout_t; + +typedef struct afr_spb_status { + call_frame_t *frame; + loc_t *loc; +} afr_spb_status_t; + +typedef struct afr_empty_brick_args { + call_frame_t *frame; + char *op_type; + loc_t loc; + int empty_index; +} afr_empty_brick_args_t; + +typedef struct afr_read_subvol_args { + ia_type_t ia_type; + uuid_t gfid; +} afr_read_subvol_args_t; + +typedef struct afr_granular_esh_args { + fd_t *heal_fd; + xlator_t *xl; + call_frame_t *frame; + gf_boolean_t mismatch; /* flag to represent occurrence of type/gfid + mismatch */ +} afr_granular_esh_args_t; + +int +afr_inode_get_readable(call_frame_t *frame, inode_t *inode, xlator_t *this, + unsigned char *readable, int *event_p, int type); int -afr_attempt_lock_recovery (xlator_t *this, int32_t child_index); +afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvols, + int *event_generation); +int +__afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvols, + int *event_generation); int -afr_save_locked_fd (xlator_t *this, fd_t *fd); +__afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvol, + int event_generation); +int +afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvols, + int event_generation); int -afr_mark_locked_nodes (xlator_t *this, fd_t *fd, - unsigned char *locked_nodes); +__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this); -void -afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner); +int +afr_inode_need_refresh_set(inode_t *inode, xlator_t *this); int -afr_set_lock_number (call_frame_t *frame, xlator_t *this); +afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this, + unsigned char *readable, + afr_read_subvol_args_t *args); +int +afr_inode_read_subvol_type_get(inode_t *inode, xlator_t *this, + unsigned char *readable, int *event_p, int type); +int +afr_read_subvol_get(inode_t *inode, xlator_t *this, int *subvol_p, + unsigned char *readables, int *event_p, + afr_transaction_type type, afr_read_subvol_args_t *args); -loc_t * -lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2); +#define afr_data_subvol_get(i, t, s, r, e, a) \ + afr_read_subvol_get(i, t, s, r, e, AFR_DATA_TRANSACTION, a) -int32_t -afr_unlock (call_frame_t *frame, xlator_t *this); +#define afr_metadata_subvol_get(i, t, s, r, e, a) \ + afr_read_subvol_get(i, t, s, r, e, AFR_METADATA_TRANSACTION, a) int -afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this); +afr_inode_refresh(call_frame_t *frame, xlator_t *this, inode_t *inode, + uuid_t gfid, afr_inode_refresh_cbk_t cbk); -int -afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this); +int32_t +afr_notify(xlator_t *this, int32_t event, void *data, void *data2); int -afr_blocking_lock (call_frame_t *frame, xlator_t *this); +xattr_is_equal(dict_t *this, char *key1, data_t *value1, void *data); int -afr_internal_lock_finish (call_frame_t *frame, xlator_t *this); +afr_add_entry_lockee(afr_local_t *local, loc_t *loc, char *basename, + int child_count); int -afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, - unsigned int child_count); +afr_add_inode_lockee(afr_local_t *local, int child_count); -int pump_start (call_frame_t *frame, xlator_t *this); +void +afr_lockees_cleanup(afr_internal_lock_t *int_lock); int -__afr_fd_ctx_set (xlator_t *this, fd_t *fd); +afr_attempt_lock_recovery(xlator_t *this, int32_t child_index); int -afr_fd_ctx_set (xlator_t *this, fd_t *fd); - -int32_t -afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children); +afr_mark_locked_nodes(xlator_t *this, fd_t *fd, unsigned char *locked_nodes); void -afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, - int32_t *fresh_children); +afr_set_lk_owner(call_frame_t *frame, xlator_t *this, void *lk_owner); int -afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno); +afr_set_lock_number(call_frame_t *frame, xlator_t *this); -unsigned int -afr_up_children_count (unsigned char *child_up, unsigned int child_count); +int32_t +afr_unlock(call_frame_t *frame, xlator_t *this); -unsigned int -afr_locked_children_count (unsigned char *children, unsigned int child_count); +int +afr_lock_nonblocking(call_frame_t *frame, xlator_t *this); -unsigned int -afr_pre_op_done_children_count (unsigned char *pre_op, - unsigned int child_count); +int +afr_blocking_lock(call_frame_t *frame, xlator_t *this); -gf_boolean_t -afr_is_fresh_lookup (loc_t *loc, xlator_t *this); +int +afr_internal_lock_finish(call_frame_t *frame, xlator_t *this); -void -afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent); +int +__afr_fd_ctx_set(xlator_t *this, fd_t *fd); + +afr_fd_ctx_t * +afr_fd_ctx_get(fd_t *fd, xlator_t *this); int -afr_locked_nodes_count (unsigned char *locked_nodes, int child_count); +afr_build_parent_loc(loc_t *parent, loc_t *child, int32_t *op_errno); -void -afr_local_cleanup (afr_local_t *local, xlator_t *this); +int +afr_locked_nodes_count(unsigned char *locked_nodes, int child_count); int -afr_frame_return (call_frame_t *frame); +afr_replies_interpret(call_frame_t *frame, xlator_t *this, inode_t *inode, + gf_boolean_t *start_heal); -gf_boolean_t -afr_is_split_brain (xlator_t *this, inode_t *inode); +void +afr_local_replies_wipe(afr_local_t *local, afr_private_t *priv); void -afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb, - afr_spb_state_t data_spb); +afr_local_cleanup(afr_local_t *local, xlator_t *this); int -afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, dict_t *xdata); +afr_frame_return(call_frame_t *frame); -void -afr_set_opendir_done (xlator_t *this, inode_t *inode); - -gf_boolean_t -afr_is_opendir_done (xlator_t *this, inode_t *inode); +int +afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata); void -afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this); +afr_local_transaction_cleanup(afr_local_t *local, xlator_t *this); int -afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd); +afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd); + +#define AFR_STACK_UNWIND(fop, frame, op_ret, op_errno, params...) \ + do { \ + afr_local_t *__local = NULL; \ + xlator_t *__this = NULL; \ + int32_t __op_ret = 0; \ + int32_t __op_errno = 0; \ + \ + __op_ret = op_ret; \ + __op_errno = op_errno; \ + if (frame) { \ + __local = frame->local; \ + __this = frame->this; \ + afr_handle_inconsistent_fop(frame, &__op_ret, &__op_errno); \ + if (__local && __local->is_read_txn) \ + afr_pending_read_decrement(__this->private, \ + __local->read_subvol); \ + if (__local && __local->xdata_req && \ + afr_is_lock_mode_mandatory(__local->xdata_req)) \ + afr_dom_lock_release(frame); \ + frame->local = NULL; \ + } \ + \ + STACK_UNWIND_STRICT(fop, frame, __op_ret, __op_errno, params); \ + if (__local) { \ + afr_local_cleanup(__local, __this); \ + mem_put(__local); \ + } \ + } while (0) + +#define AFR_STACK_DESTROY(frame) \ + do { \ + afr_local_t *__local = NULL; \ + xlator_t *__this = NULL; \ + __local = frame->local; \ + __this = frame->this; \ + frame->local = NULL; \ + STACK_DESTROY(frame->root); \ + if (__local) { \ + afr_local_cleanup(__local, __this); \ + mem_put(__local); \ + } \ + } while (0); + +#define AFR_FRAME_INIT(frame, op_errno) \ + ({ \ + frame->local = mem_get0(THIS->local_pool); \ + if (afr_local_init(frame->local, frame->this->private, &op_errno)) { \ + afr_local_cleanup(frame->local, frame->this); \ + mem_put(frame->local); \ + frame->local = NULL; \ + }; \ + frame->local; \ + }) + +#define AFR_STACK_RESET(frame) \ + do { \ + afr_local_t *__local = NULL; \ + xlator_t *__this = NULL; \ + __local = frame->local; \ + __this = frame->this; \ + frame->local = NULL; \ + int __opr; \ + STACK_RESET(frame->root); \ + if (__local) { \ + afr_local_cleanup(__local, __this); \ + mem_put(__local); \ + } \ + AFR_FRAME_INIT(frame, __opr); \ + } while (0) -int -afr_launch_openfd_self_heal (call_frame_t *frame, xlator_t *this, fd_t *fd); - -#define AFR_STACK_UNWIND(fop, frame, params ...) \ - do { \ - afr_local_t *__local = NULL; \ - xlator_t *__this = NULL; \ - if (frame) { \ - __local = frame->local; \ - __this = frame->this; \ - frame->local = NULL; \ - } \ - STACK_UNWIND_STRICT (fop, frame, params); \ - if (__local) { \ - afr_local_cleanup (__local, __this); \ - mem_put (__local); \ - } \ - } while (0) - -#define AFR_STACK_DESTROY(frame) \ - do { \ - afr_local_t *__local = NULL; \ - xlator_t *__this = NULL; \ - __local = frame->local; \ - __this = frame->this; \ - frame->local = NULL; \ - STACK_DESTROY (frame->root); \ - if (__local) { \ - afr_local_cleanup (__local, __this); \ - mem_put (__local); \ - } \ - } while (0); - -#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/ /* allocate and return a string that is the basename of argument */ static inline char * -AFR_BASENAME (const char *str) +AFR_BASENAME(const char *str) { - char *__tmp_str = NULL; - char *__basename_str = NULL; - __tmp_str = gf_strdup (str); - __basename_str = gf_strdup (basename (__tmp_str)); - GF_FREE (__tmp_str); - return __basename_str; + char *__tmp_str = NULL; + char *__basename_str = NULL; + __tmp_str = gf_strdup(str); + __basename_str = gf_strdup(basename(__tmp_str)); + GF_FREE(__tmp_str); + return __basename_str; } +call_frame_t * +afr_copy_frame(call_frame_t *base); + int -afr_transaction_local_init (afr_local_t *local, xlator_t *this); +afr_transaction_local_init(afr_local_t *local, xlator_t *this); int32_t -afr_marker_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name,afr_local_t *local, afr_private_t *priv ); +afr_marker_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, afr_local_t *local, afr_private_t *priv); -int32_t * -afr_children_create (int32_t child_count); +int +afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno); int -afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno); +afr_internal_lock_init(afr_internal_lock_t *lk, size_t child_count); int -afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, - transaction_lk_type_t lk_type); +afr_higher_errno(int32_t old_errno, int32_t new_errno); int -afr_first_up_child (unsigned char *child_up, size_t child_count); +afr_final_errno(afr_local_t *local, afr_private_t *priv); int -afr_select_read_child_from_policy (int32_t *fresh_children, int32_t child_count, - int32_t prev_read_child, - int32_t config_read_child, int32_t *sources, - unsigned int hmode, uuid_t gfid); +afr_xattr_req_prepare(xlator_t *this, dict_t *xattr_req); void -afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, - int32_t *fresh_children, int32_t prev_read_child, - int32_t config_read_child, uuid_t gfid); +afr_fix_open(fd_t *fd, xlator_t *this); -int32_t -afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child, - int32_t *fresh_children, - int32_t *call_child, int32_t *last_index); +afr_fd_ctx_t * +afr_fd_ctx_get(fd_t *fd, xlator_t *this); -int32_t -afr_next_call_child (int32_t *fresh_children, unsigned char *child_up, - size_t child_count, int32_t *last_index, - int32_t read_child); -void -afr_get_fresh_children (int32_t *success_children, int32_t *sources, - int32_t *children, unsigned int child_count); void -afr_children_add_child (int32_t *children, int32_t child, - int32_t child_count); +afr_set_low_priority(call_frame_t *frame); +int +afr_child_fd_ctx_set(xlator_t *this, fd_t *fd, int32_t child, int flags); + void -afr_children_rm_child (int32_t *children, int32_t child, - int32_t child_count); +afr_matrix_cleanup(int32_t **pending, unsigned int m); + +int32_t ** +afr_matrix_create(unsigned int m, unsigned int n); + +int ** +afr_mark_pending_changelog(afr_private_t *priv, unsigned char *pending, + dict_t *xattr, ia_type_t iat); + void -afr_reset_children (int32_t *children, int32_t child_count); -int32_t -afr_most_important_error(int32_t old_errno, int32_t new_errno, - gf_boolean_t eio); -int -afr_errno_count (int32_t *children, int *child_errno, - unsigned int child_count, int32_t op_errno); +afr_filter_xattrs(dict_t *xattr); + +/* + * Special value indicating we should use the "auto" quorum method instead of + * a fixed value (including zero to turn off quorum enforcement). + */ +#define AFR_QUORUM_AUTO INT_MAX + int -afr_get_children_count (int32_t *children, unsigned int child_count); +afr_fd_report_unstable_write(xlator_t *this, afr_local_t *local); + gf_boolean_t -afr_is_child_present (int32_t *success_children, int32_t child_count, - int32_t child); +afr_fd_has_witnessed_unstable_write(xlator_t *this, inode_t *inode); + void -afr_update_gfid_from_iatts (uuid_t uuid, struct iatt *bufs, - int32_t *success_children, - unsigned int child_count); +afr_reply_wipe(struct afr_reply *reply); + void -afr_reset_xattr (dict_t **xattr, unsigned int child_count); +afr_replies_wipe(struct afr_reply *replies, int count); + gf_boolean_t -afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, - unsigned int child_count, const char *path, - const char *xlator_name); -unsigned int -afr_gfid_missing_count (const char *xlator_name, int32_t *children, - struct iatt *bufs, unsigned int child_count, - const char *path); -void -afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, const char *path); +afr_xattrs_are_equal(dict_t *dict1, dict_t *dict2); + +gf_boolean_t +afr_is_xattr_ignorable(char *key); + +int +afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc); + +int +afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc); + +int +afr_get_split_brain_status(void *opaque); + +int +afr_get_split_brain_status_cbk(int ret, call_frame_t *frame, void *opaque); + +int +afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this, + int spb_choice); +int +afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this, + call_frame_t *frame, int *spb_subvol); +int +afr_get_child_index_from_name(xlator_t *this, char *name); + +int +afr_is_split_brain(call_frame_t *frame, xlator_t *this, inode_t *inode, + uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb); +int +afr_spb_choice_timeout_cancel(xlator_t *this, inode_t *inode); + +int +afr_set_split_brain_choice(int ret, call_frame_t *frame, void *opaque); + +gf_boolean_t +afr_get_need_heal(xlator_t *this); + void -afr_children_copy (int32_t *dst, int32_t *src, unsigned int child_count); -afr_transaction_type -afr_transaction_type_get (ia_type_t ia_type); +afr_set_need_heal(xlator_t *this, afr_local_t *local); + +int +afr_selfheal_data_open(xlator_t *this, inode_t *inode, fd_t **fd); + +int +afr_get_msg_id(char *op_type); + +int +afr_set_in_flight_sb_status(xlator_t *this, call_frame_t *frame, + inode_t *inode); + int32_t -afr_resultant_errno_get (int32_t *children, - int *child_errno, unsigned int child_count); +afr_quorum_errno(afr_private_t *priv); + +gf_boolean_t +afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv, + int32_t *op_errno); +void +afr_handle_inconsistent_fop(call_frame_t *frame, int32_t *op_ret, + int32_t *op_errno); + void -afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, - int32_t *stale_children); +afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata); void -afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, - gf_boolean_t background, ia_type_t ia_type, char *reason, - void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, - xlator_t *this), - int (*unwind) (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno, - int32_t sh_failed)); +afr_process_post_writev(call_frame_t *frame, xlator_t *this); + void -afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open); +afr_writev_unwind(call_frame_t *frame, xlator_t *this); void -afr_open_fd_fix (fd_t *fd, xlator_t *this); -int -afr_set_elem_count_get (unsigned char *elems, int child_count); +afr_writev_copy_outvars(call_frame_t *src_frame, call_frame_t *dst_frame); +void +afr_update_uninodelk(afr_local_t *local, afr_internal_lock_t *int_lock, + int32_t child_index); afr_fd_ctx_t * -afr_fd_ctx_get (fd_t *fd, xlator_t *this); +__afr_fd_ctx_get(fd_t *fd, xlator_t *this); gf_boolean_t -afr_open_only_data_self_heal (char *data_self_heal); +afr_is_inode_refresh_reqd(inode_t *inode, xlator_t *this, int event_gen1, + int event_gen2); +int +afr_serialize_xattrs_with_delimiter(call_frame_t *frame, xlator_t *this, + char *buf, const char *default_str, + int32_t *serz_len, char delimiter); gf_boolean_t -afr_data_self_heal_enabled (char *data_self_heal); +afr_is_symmetric_error(call_frame_t *frame, xlator_t *this); -void -afr_set_low_priority (call_frame_t *frame); int -afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child, - int flags); +__afr_inode_ctx_get(xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx); -gf_boolean_t -afr_have_quorum (char *logname, afr_private_t *priv); +uint64_t +afr_write_subvol_get(call_frame_t *frame, xlator_t *this); -void -afr_matrix_cleanup (int32_t **pending, unsigned int m); +int +afr_write_subvol_set(call_frame_t *frame, xlator_t *this); -int32_t** -afr_matrix_create (unsigned int m, unsigned int n); +int +afr_write_subvol_reset(call_frame_t *frame, xlator_t *this); -gf_boolean_t -afr_is_errno_set (int *child_errno, int child); +int +afr_set_inode_local(xlator_t *this, afr_local_t *local, inode_t *inode); + +int +afr_fill_ta_loc(xlator_t *this, loc_t *loc, gf_boolean_t is_gfid_based_fop); + +int +afr_ta_post_op_lock(xlator_t *this, loc_t *loc); + +int +afr_ta_post_op_unlock(xlator_t *this, loc_t *loc); gf_boolean_t -afr_is_errno_unset (int *child_errno, int child); +afr_is_pending_set(xlator_t *this, dict_t *xdata, int type); + +int +__afr_get_up_children_count(afr_private_t *priv); + +call_frame_t * +afr_ta_frame_create(xlator_t *this); gf_boolean_t -afr_is_fd_fixable (fd_t *fd); +afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local); void -afr_prepare_new_entry_pending_matrix (int32_t **pending, - gf_boolean_t (*is_pending) (int *, int), - int *ctx, struct iatt *buf, - unsigned int child_count); +afr_ta_lock_release_synctask(xlator_t *this); + void -afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count); -/* - * Special value indicating we should use the "auto" quorum method instead of - * a fixed value (including zero to turn off quorum enforcement). - */ -#define AFR_QUORUM_AUTO INT_MAX +afr_ta_locked_priv_invalidate(afr_private_t *priv); -/* - * Having this as a macro will make debugging a bit weirder, but does reduce - * the probability of functions handling this check inconsistently. - */ -#define QUORUM_CHECK(_func,_label) do { \ - if (priv->quorum_count && !afr_have_quorum(this->name,priv)) { \ - gf_log(this->name,GF_LOG_WARNING, \ - "failing "#_func" due to lack of quorum"); \ - op_errno = EROFS; \ - goto _label; \ - } \ -} while (0); +gf_boolean_t +afr_lookup_has_quorum(call_frame_t *frame, + const unsigned int up_children_count); +void +afr_mark_new_entry_changelog(call_frame_t *frame, xlator_t *this); -int -afr_fd_report_unstable_write (xlator_t *this, fd_t *fd); +void +afr_handle_replies_quorum(call_frame_t *frame, xlator_t *this); gf_boolean_t -afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd); +afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv, + int child); void -afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub); +afr_selfheal_childup(xlator_t *this, afr_private_t *priv); -int -afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count); +gf_boolean_t +afr_is_lock_mode_mandatory(dict_t *xdata); void -afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this); +afr_dom_lock_release(call_frame_t *frame); -afr_inode_ctx_t* -afr_inode_ctx_get (inode_t *inode, xlator_t *this); +void +afr_fill_success_replies(afr_local_t *local, afr_private_t *priv, + unsigned char *replies); +gf_boolean_t +afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name, + pid_t pid); #endif /* __AFR_H__ */ diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c deleted file mode 100644 index a7f72fb30e9..00000000000 --- a/xlators/cluster/afr/src/pump.c +++ /dev/null @@ -1,2641 +0,0 @@ -/* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#include <unistd.h> -#include <sys/time.h> -#include <stdlib.h> -#include <fnmatch.h> - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "afr-common.c" -#include "defaults.c" -#include "glusterfs.h" - -static uint64_t pump_pid = 0; -static inline void -pump_fill_loc_info (loc_t *loc, struct iatt *iatt, struct iatt *parent) -{ - afr_update_loc_gfids (loc, iatt, parent); - uuid_copy (loc->inode->gfid, iatt->ia_gfid); -} - -static int -pump_mark_start_pending (xlator_t *this) -{ - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - - priv = this->private; - pump_priv = priv->pump_private; - - pump_priv->pump_start_pending = 1; - - return 0; -} - -static int -is_pump_start_pending (xlator_t *this) -{ - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - - priv = this->private; - pump_priv = priv->pump_private; - - return (pump_priv->pump_start_pending); -} - -static int -pump_remove_start_pending (xlator_t *this) -{ - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - - priv = this->private; - pump_priv = priv->pump_private; - - pump_priv->pump_start_pending = 0; - - return 0; -} - -static pump_state_t -pump_get_state () -{ - xlator_t *this = NULL; - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - - pump_state_t ret; - - this = THIS; - priv = this->private; - pump_priv = priv->pump_private; - - LOCK (&pump_priv->pump_state_lock); - { - ret = pump_priv->pump_state; - } - UNLOCK (&pump_priv->pump_state_lock); - - return ret; -} - -int -pump_change_state (xlator_t *this, pump_state_t state) -{ - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - - pump_state_t state_old; - pump_state_t state_new; - - - priv = this->private; - pump_priv = priv->pump_private; - - GF_ASSERT (pump_priv); - - LOCK (&pump_priv->pump_state_lock); - { - state_old = pump_priv->pump_state; - state_new = state; - - pump_priv->pump_state = state; - - } - UNLOCK (&pump_priv->pump_state_lock); - - gf_log (this->name, GF_LOG_DEBUG, - "Pump changing state from %d to %d", - state_old, - state_new); - - return 0; -} - -static int -pump_set_resume_path (xlator_t *this, const char *path) -{ - int ret = 0; - - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - - priv = this->private; - pump_priv = priv->pump_private; - - GF_ASSERT (pump_priv); - - LOCK (&pump_priv->resume_path_lock); - { - strncpy (pump_priv->resume_path, path, strlen (path) + 1); - } - UNLOCK (&pump_priv->resume_path_lock); - - return ret; -} - -static int -pump_save_path (xlator_t *this, const char *path) -{ - afr_private_t *priv = NULL; - pump_state_t state; - dict_t *dict = NULL; - loc_t loc = {0}; - int dict_ret = 0; - int ret = -1; - - state = pump_get_state (); - if (state == PUMP_STATE_RESUME) - return 0; - - priv = this->private; - - GF_ASSERT (priv->root_inode); - - afr_build_root_loc (this, &loc); - - dict = dict_new (); - dict_ret = dict_set_str (dict, PUMP_PATH, (char *)path); - if (dict_ret) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set the key %s", path, PUMP_PATH); - - ret = syncop_setxattr (PUMP_SOURCE_CHILD (this), &loc, dict, 0); - - if (ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "setxattr failed - could not save path=%s", path); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "setxattr succeeded - saved path=%s", path); - } - - dict_unref (dict); - - loc_wipe (&loc); - return 0; -} - -static int -pump_check_and_update_status (xlator_t *this) -{ - pump_state_t state; - int ret = -1; - - state = pump_get_state (); - - switch (state) { - - case PUMP_STATE_RESUME: - case PUMP_STATE_RUNNING: - { - ret = 0; - break; - } - case PUMP_STATE_PAUSE: - { - ret = -1; - break; - } - case PUMP_STATE_ABORT: - { - pump_save_path (this, "/"); - ret = -1; - break; - } - default: - { - gf_log (this->name, GF_LOG_DEBUG, - "Unknown pump state"); - ret = -1; - break; - } - - } - - return ret; -} - -static const char * -pump_get_resume_path (xlator_t *this) -{ - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - - const char *resume_path = NULL; - - priv = this->private; - pump_priv = priv->pump_private; - - resume_path = pump_priv->resume_path; - - return resume_path; -} - -static int -pump_update_resume_state (xlator_t *this, const char *path) -{ - pump_state_t state; - const char *resume_path = NULL; - - state = pump_get_state (); - - if (state == PUMP_STATE_RESUME) { - resume_path = pump_get_resume_path (this); - if (strcmp (resume_path, "/") == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Reached the resume path (/). Proceeding to change state" - " to running"); - pump_change_state (this, PUMP_STATE_RUNNING); - } else if (strcmp (resume_path, path) == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Reached the resume path. Proceeding to change state" - " to running"); - pump_change_state (this, PUMP_STATE_RUNNING); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "Not yet hit the resume path:res-path=%s,path=%s", - resume_path, path); - } - } - - return 0; -} - -static gf_boolean_t -is_pump_traversal_allowed (xlator_t *this, const char *path) -{ - pump_state_t state; - const char *resume_path = NULL; - gf_boolean_t ret = _gf_true; - - state = pump_get_state (); - - if (state == PUMP_STATE_RESUME) { - resume_path = pump_get_resume_path (this); - if (strstr (resume_path, path)) { - gf_log (this->name, GF_LOG_DEBUG, - "On the right path to resumption path"); - ret = _gf_true; - } else { - gf_log (this->name, GF_LOG_DEBUG, - "Not the right path to resuming=> ignoring traverse"); - ret = _gf_false; - } - } - - return ret; -} - -static int -pump_save_file_stats (xlator_t *this, const char *path) -{ - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - - priv = this->private; - pump_priv = priv->pump_private; - - LOCK (&pump_priv->resume_path_lock); - { - pump_priv->number_files_pumped++; - - strncpy (pump_priv->current_file, path, - PATH_MAX); - } - UNLOCK (&pump_priv->resume_path_lock); - - return 0; -} - -static int -gf_pump_traverse_directory (loc_t *loc) -{ - xlator_t *this = NULL; - fd_t *fd = NULL; - off_t offset = 0; - loc_t entry_loc = {0}; - gf_dirent_t *entry = NULL; - gf_dirent_t *tmp = NULL; - gf_dirent_t entries; - struct iatt iatt = {0}; - struct iatt parent = {0}; - dict_t *xattr_rsp = NULL; - int ret = 0; - gf_boolean_t is_directory_empty = _gf_true; - gf_boolean_t free_entries = _gf_false; - - INIT_LIST_HEAD (&entries.list); - this = THIS; - - GF_ASSERT (loc->inode); - - fd = fd_create (loc->inode, pump_pid); - if (!fd) { - gf_log (this->name, GF_LOG_ERROR, - "Failed to create fd for %s", loc->path); - goto out; - } - - ret = syncop_opendir (this, loc, fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "opendir failed on %s", loc->path); - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "pump opendir on %s returned=%d", - loc->path, ret); - - while (syncop_readdirp (this, fd, 131072, offset, NULL, &entries)) { - free_entries = _gf_true; - - if (list_empty (&entries.list)) { - gf_log (this->name, GF_LOG_TRACE, - "no more entries in directory"); - goto out; - } - - list_for_each_entry_safe (entry, tmp, &entries.list, list) { - gf_log (this->name, GF_LOG_DEBUG, - "found readdir entry=%s", entry->d_name); - - offset = entry->d_off; - if (uuid_is_null (entry->d_stat.ia_gfid)) { - gf_log (this->name, GF_LOG_WARNING, "%s/%s: No " - "gfid present skipping", - loc->path, entry->d_name); - continue; - } - loc_wipe (&entry_loc); - ret = afr_build_child_loc (this, &entry_loc, loc, - entry->d_name); - if (ret) - goto out; - - if (!IS_ENTRY_CWD (entry->d_name) && - !IS_ENTRY_PARENT (entry->d_name)) { - - is_directory_empty = _gf_false; - gf_log (this->name, GF_LOG_DEBUG, - "lookup %s => %"PRId64, - entry_loc.path, - iatt.ia_ino); - - ret = syncop_lookup (this, &entry_loc, NULL, - &iatt, &xattr_rsp, &parent); - - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "%s: lookup failed", - entry_loc.path); - continue; - } - pump_fill_loc_info (&entry_loc, &iatt, - &parent); - - pump_update_resume_state (this, entry_loc.path); - - pump_save_path (this, entry_loc.path); - pump_save_file_stats (this, entry_loc.path); - - ret = pump_check_and_update_status (this); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Pump beginning to exit out"); - goto out; - } - - if (IA_ISDIR (iatt.ia_type)) { - if (is_pump_traversal_allowed (this, entry_loc.path)) { - gf_log (this->name, GF_LOG_TRACE, - "entering dir=%s", - entry->d_name); - gf_pump_traverse_directory (&entry_loc); - } - } - } - } - - gf_dirent_free (&entries); - free_entries = _gf_false; - gf_log (this->name, GF_LOG_TRACE, - "offset incremented to %d", - (int32_t ) offset); - - } - - ret = syncop_close (fd); - if (ret < 0) - gf_log (this->name, GF_LOG_DEBUG, "closing the fd failed"); - - if (is_directory_empty && IS_ROOT_PATH (loc->path)) { - pump_change_state (this, PUMP_STATE_RUNNING); - gf_log (this->name, GF_LOG_INFO, "Empty source brick. " - "Nothing to be done."); - } - -out: - if (entry_loc.path) - loc_wipe (&entry_loc); - if (free_entries) - gf_dirent_free (&entries); - return 0; -} - -static int -pump_update_resume_path (xlator_t *this) -{ - const char *resume_path = NULL; - - resume_path = pump_get_resume_path (this); - - if (resume_path) { - gf_log (this->name, GF_LOG_DEBUG, - "Found a path to resume from: %s", - resume_path); - - }else { - gf_log (this->name, GF_LOG_DEBUG, - "Did not find a path=> setting to '/'"); - pump_set_resume_path (this, "/"); - } - - pump_change_state (this, PUMP_STATE_RESUME); - - return 0; -} - -static int32_t -pump_xattr_cleaner (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_private_t *priv = NULL; - loc_t loc = {0}; - int i = 0; - int ret = 0; - int source = 0; - int sink = 1; - - priv = this->private; - - afr_build_root_loc (this, &loc); - - ret = syncop_removexattr (priv->children[source], &loc, - PUMP_PATH); - - ret = syncop_removexattr (priv->children[sink], &loc, - PUMP_SINK_COMPLETE); - - for (i = 0; i < priv->child_count; i++) { - ret = syncop_removexattr (priv->children[i], &loc, - PUMP_SOURCE_COMPLETE); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, "removexattr " - "failed with %s", strerror (errno)); - } - - loc_wipe (&loc); - return pump_command_reply (frame, this); -} - -static int -pump_complete_migration (xlator_t *this) -{ - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - dict_t *dict = NULL; - pump_state_t state; - loc_t loc = {0}; - int dict_ret = 0; - int ret = -1; - - priv = this->private; - pump_priv = priv->pump_private; - - GF_ASSERT (priv->root_inode); - - afr_build_root_loc (this, &loc); - - dict = dict_new (); - - state = pump_get_state (); - if (state == PUMP_STATE_RUNNING) { - gf_log (this->name, GF_LOG_DEBUG, - "Pump finished pumping"); - - pump_priv->pump_finished = _gf_true; - - dict_ret = dict_set_str (dict, PUMP_SOURCE_COMPLETE, "jargon"); - if (dict_ret) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set the key %s", - loc.path, PUMP_SOURCE_COMPLETE); - - ret = syncop_setxattr (PUMP_SOURCE_CHILD (this), &loc, dict, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "setxattr failed - while notifying source complete"); - } - dict_ret = dict_set_str (dict, PUMP_SINK_COMPLETE, "jargon"); - if (dict_ret) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set the key %s", - loc.path, PUMP_SINK_COMPLETE); - - ret = syncop_setxattr (PUMP_SINK_CHILD (this), &loc, dict, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "setxattr failed - while notifying sink complete"); - } - - pump_save_path (this, "/"); - - } else if (state == PUMP_STATE_ABORT) { - gf_log (this->name, GF_LOG_DEBUG, "Starting cleanup " - "of pump internal xattrs"); - call_resume (pump_priv->cleaner); - } - - loc_wipe (&loc); - return 0; -} - -static int -pump_lookup_sink (loc_t *loc) -{ - xlator_t *this = NULL; - struct iatt iatt, parent; - dict_t *xattr_rsp; - dict_t *xattr_req = NULL; - int ret = 0; - - this = THIS; - - xattr_req = dict_new (); - - ret = afr_set_root_gfid (xattr_req); - if (ret) - goto out; - - ret = syncop_lookup (PUMP_SINK_CHILD (this), loc, - xattr_req, &iatt, &xattr_rsp, &parent); - - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "Lookup on sink child failed"); - goto out; - } - -out: - if (xattr_req) - dict_unref (xattr_req); - - return ret; -} - -static int -pump_task (void *data) -{ - xlator_t *this = NULL; - afr_private_t *priv = NULL; - - - loc_t loc = {0}; - struct iatt iatt, parent; - dict_t *xattr_rsp = NULL; - dict_t *xattr_req = NULL; - - int ret = -1; - - this = THIS; - priv = this->private; - - GF_ASSERT (priv->root_inode); - - afr_build_root_loc (this, &loc); - xattr_req = dict_new (); - if (!xattr_req) { - gf_log (this->name, GF_LOG_DEBUG, - "Out of memory"); - ret = -1; - goto out; - } - - afr_set_root_gfid (xattr_req); - ret = syncop_lookup (this, &loc, xattr_req, - &iatt, &xattr_rsp, &parent); - - gf_log (this->name, GF_LOG_TRACE, - "lookup: path=%s gfid=%s", - loc.path, uuid_utoa (loc.inode->gfid)); - - ret = pump_check_and_update_status (this); - if (ret < 0) { - goto out; - } - - pump_update_resume_path (this); - - afr_set_root_gfid (xattr_req); - ret = pump_lookup_sink (&loc); - if (ret) { - pump_update_resume_path (this); - goto out; - } - - gf_pump_traverse_directory (&loc); - - pump_complete_migration (this); -out: - if (xattr_req) - dict_unref (xattr_req); - - loc_wipe (&loc); - return 0; -} - - -static int -pump_task_completion (int ret, call_frame_t *sync_frame, void *data) -{ - xlator_t *this = NULL; - afr_private_t *priv = NULL; - - this = THIS; - - priv = this->private; - - inode_unref (priv->root_inode); - STACK_DESTROY (sync_frame->root); - - gf_log (this->name, GF_LOG_DEBUG, - "Pump xlator exiting"); - return 0; -} - -int -pump_start (call_frame_t *pump_frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - - int ret = -1; - - priv = this->private; - pump_priv = priv->pump_private; - - afr_set_lk_owner (pump_frame, this, pump_frame->root); - pump_pid = (uint64_t) (unsigned long)pump_frame->root; - - ret = synctask_new (pump_priv->env, pump_task, - pump_task_completion, - pump_frame, NULL); - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "starting pump failed"); - pump_change_state (this, PUMP_STATE_ABORT); - goto out; - } - - gf_log (this->name, GF_LOG_DEBUG, - "setting pump as started lk_owner: %s %"PRIu64, - lkowner_utoa (&pump_frame->root->lk_owner), pump_pid); - - priv->use_afr_in_pump = 1; -out: - return ret; -} - -static int -pump_start_synctask (xlator_t *this) -{ - call_frame_t *frame = NULL; - int ret = 0; - - frame = create_frame (this, this->ctx->pool); - if (!frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - ret = -1; - goto out; - } - - pump_change_state (this, PUMP_STATE_RUNNING); - - ret = pump_start (frame, this); - -out: - return ret; -} - -int32_t -pump_cmd_start_setxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, dict_t *xdata) - -{ - call_frame_t *prev = NULL; - afr_local_t *local = NULL; - int ret = 0; - - local = frame->local; - - if (op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "Could not initiate destination " - "brick connect"); - ret = op_ret; - goto out; - } - - gf_log (this->name, GF_LOG_DEBUG, - "Successfully initiated destination " - "brick connect"); - - pump_mark_start_pending (this); - - /* send the PARENT_UP as pump is ready now */ - prev = cookie; - if (prev && prev->this) - prev->this->notify (prev->this, GF_EVENT_PARENT_UP, this); - -out: - local->op_ret = ret; - pump_command_reply (frame, this); - - return 0; -} - -static int -pump_initiate_sink_connect (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - dict_t *dict = NULL; - data_t *data = NULL; - char *clnt_cmd = NULL; - loc_t loc = {0}; - - int ret = 0; - - priv = this->private; - local = frame->local; - - GF_ASSERT (priv->root_inode); - - afr_build_root_loc (this, &loc); - - data = data_ref (dict_get (local->dict, RB_PUMP_CMD_START)); - if (!data) { - ret = -1; - gf_log (this->name, GF_LOG_ERROR, - "Could not get destination brick value"); - goto out; - } - - dict = dict_new (); - if (!dict) { - ret = -1; - goto out; - } - - clnt_cmd = GF_CALLOC (1, data->len+1, gf_common_mt_char); - if (!clnt_cmd) { - ret = -1; - goto out; - } - - memcpy (clnt_cmd, data->data, data->len); - clnt_cmd[data->len] = '\0'; - gf_log (this->name, GF_LOG_DEBUG, "Got destination brick %s\n", - clnt_cmd); - - ret = dict_set_dynstr (dict, CLIENT_CMD_CONNECT, clnt_cmd); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "Could not inititiate destination brick " - "connect"); - goto out; - } - - STACK_WIND (frame, - pump_cmd_start_setxattr_cbk, - PUMP_SINK_CHILD(this), - PUMP_SINK_CHILD(this)->fops->setxattr, - &loc, - dict, - 0, NULL); - - ret = 0; - -out: - if (dict) - dict_unref (dict); - - if (data) - data_unref (data); - - if (ret && clnt_cmd) - GF_FREE (clnt_cmd); - - loc_wipe (&loc); - return ret; -} - -static int -is_pump_aborted (xlator_t *this) -{ - pump_state_t state; - - state = pump_get_state (); - - return ((state == PUMP_STATE_ABORT)); -} - -int32_t -pump_cmd_start_getxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dict_t *dict, dict_t *xdata) -{ - afr_local_t *local = NULL; - char *path = NULL; - - pump_state_t state; - int ret = 0; - int need_unwind = 0; - int dict_ret = -1; - - local = frame->local; - - if (op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "getxattr failed - changing pump " - "state to RUNNING with '/'"); - path = "/"; - ret = op_ret; - } else { - gf_log (this->name, GF_LOG_TRACE, - "getxattr succeeded"); - - dict_ret = dict_get_str (dict, PUMP_PATH, &path); - if (dict_ret < 0) - path = "/"; - } - - state = pump_get_state (); - if ((state == PUMP_STATE_RUNNING) || - (state == PUMP_STATE_RESUME)) { - gf_log (this->name, GF_LOG_ERROR, - "Pump is already started"); - ret = -1; - goto out; - } - - pump_set_resume_path (this, path); - - if (is_pump_aborted (this)) - /* We're re-starting pump afresh */ - ret = pump_initiate_sink_connect (frame, this); - else { - /* We're re-starting pump from a previous - pause */ - gf_log (this->name, GF_LOG_DEBUG, - "about to start synctask"); - ret = pump_start_synctask (this); - need_unwind = 1; - } - -out: - if ((ret < 0) || (need_unwind == 1)) { - local->op_ret = ret; - pump_command_reply (frame, this); - } - return 0; -} - -int -pump_execute_status (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - - uint64_t number_files = 0; - - char filename[PATH_MAX]; - char summary[PATH_MAX+256]; - char *dict_str = NULL; - - int32_t op_ret = 0; - int32_t op_errno = 0; - - dict_t *dict = NULL; - int ret = -1; - - priv = this->private; - pump_priv = priv->pump_private; - - LOCK (&pump_priv->resume_path_lock); - { - number_files = pump_priv->number_files_pumped; - strncpy (filename, pump_priv->current_file, PATH_MAX); - } - UNLOCK (&pump_priv->resume_path_lock); - - dict_str = GF_CALLOC (1, PATH_MAX + 256, gf_afr_mt_char); - if (!dict_str) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - if (pump_priv->pump_finished) { - snprintf (summary, PATH_MAX+256, - "no_of_files=%"PRIu64, number_files); - } else { - snprintf (summary, PATH_MAX+256, - "no_of_files=%"PRIu64":current_file=%s", - number_files, filename); - } - snprintf (dict_str, PATH_MAX+256, "status=%d:%s", - (pump_priv->pump_finished)?1:0, summary); - - dict = dict_new (); - - ret = dict_set_dynstr (dict, RB_PUMP_CMD_STATUS, dict_str); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "dict_set_dynstr returned negative value"); - } else { - dict_str = NULL; - } - - op_ret = 0; - -out: - - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, NULL); - - if (dict) - dict_unref (dict); - - GF_FREE (dict_str); - - return 0; -} - -int -pump_execute_pause (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - pump_change_state (this, PUMP_STATE_PAUSE); - - local->op_ret = 0; - pump_command_reply (frame, this); - - return 0; -} - -int -pump_execute_start (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int ret = 0; - loc_t loc = {0}; - - priv = this->private; - local = frame->local; - - if (!priv->root_inode) { - gf_log (this->name, GF_LOG_ERROR, - "Pump xlator cannot be started without an initial " - "lookup"); - ret = -1; - goto out; - } - - GF_ASSERT (priv->root_inode); - - afr_build_root_loc (this, &loc); - - STACK_WIND (frame, - pump_cmd_start_getxattr_cbk, - PUMP_SOURCE_CHILD(this), - PUMP_SOURCE_CHILD(this)->fops->getxattr, - &loc, - PUMP_PATH, NULL); - - ret = 0; - -out: - if (ret < 0) { - local->op_ret = ret; - pump_command_reply (frame, this); - } - - loc_wipe (&loc); - return 0; -} - -static int -pump_cleanup_helper (void *data) { - call_frame_t *frame = data; - - pump_xattr_cleaner (frame, 0, frame->this, 0, 0, NULL); - - return 0; -} - -static int -pump_cleanup_done (int ret, call_frame_t *sync_frame, void *data) -{ - STACK_DESTROY (sync_frame->root); - - return 0; -} - -int -pump_execute_commit (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - afr_local_t *local = NULL; - call_frame_t *sync_frame = NULL; - int ret = 0; - - priv = this->private; - pump_priv = priv->pump_private; - local = frame->local; - - local->op_ret = 0; - if (pump_priv->pump_finished) { - pump_change_state (this, PUMP_STATE_COMMIT); - sync_frame = create_frame (this, this->ctx->pool); - ret = synctask_new (pump_priv->env, pump_cleanup_helper, - pump_cleanup_done, sync_frame, frame); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, "Couldn't create " - "synctask for cleaning up xattrs."); - } - - } else { - gf_log (this->name, GF_LOG_ERROR, "Commit can't proceed. " - "Migration in progress"); - local->op_ret = -1; - local->op_errno = EINPROGRESS; - pump_command_reply (frame, this); - } - - return 0; -} -int -pump_execute_abort (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - afr_local_t *local = NULL; - call_frame_t *sync_frame = NULL; - int ret = 0; - - priv = this->private; - pump_priv = priv->pump_private; - local = frame->local; - - pump_change_state (this, PUMP_STATE_ABORT); - - LOCK (&pump_priv->resume_path_lock); - { - pump_priv->number_files_pumped = 0; - pump_priv->current_file[0] = '\0'; - } - UNLOCK (&pump_priv->resume_path_lock); - - local->op_ret = 0; - if (pump_priv->pump_finished) { - sync_frame = create_frame (this, this->ctx->pool); - ret = synctask_new (pump_priv->env, pump_cleanup_helper, - pump_cleanup_done, sync_frame, frame); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, "Couldn't create " - "synctask for cleaning up xattrs."); - } - - } else { - pump_priv->cleaner = fop_setxattr_cbk_stub (frame, - pump_xattr_cleaner, - 0, 0, NULL); - } - - return 0; -} - -gf_boolean_t -pump_command_status (xlator_t *this, dict_t *dict) -{ - char *cmd = NULL; - int dict_ret = -1; - int ret = _gf_true; - - dict_ret = dict_get_str (dict, RB_PUMP_CMD_STATUS, &cmd); - if (dict_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Not a pump status command"); - ret = _gf_false; - goto out; - } - - gf_log (this->name, GF_LOG_DEBUG, - "Hit a pump command - status"); - ret = _gf_true; - -out: - return ret; - -} - -gf_boolean_t -pump_command_pause (xlator_t *this, dict_t *dict) -{ - char *cmd = NULL; - int dict_ret = -1; - int ret = _gf_true; - - dict_ret = dict_get_str (dict, RB_PUMP_CMD_PAUSE, &cmd); - if (dict_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Not a pump pause command"); - ret = _gf_false; - goto out; - } - - gf_log (this->name, GF_LOG_DEBUG, - "Hit a pump command - pause"); - ret = _gf_true; - -out: - return ret; - -} - -gf_boolean_t -pump_command_commit (xlator_t *this, dict_t *dict) -{ - char *cmd = NULL; - int dict_ret = -1; - int ret = _gf_true; - - dict_ret = dict_get_str (dict, RB_PUMP_CMD_COMMIT, &cmd); - if (dict_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Not a pump commit command"); - ret = _gf_false; - goto out; - } - - gf_log (this->name, GF_LOG_DEBUG, - "Hit a pump command - commit"); - ret = _gf_true; - -out: - return ret; - -} - -gf_boolean_t -pump_command_abort (xlator_t *this, dict_t *dict) -{ - char *cmd = NULL; - int dict_ret = -1; - int ret = _gf_true; - - dict_ret = dict_get_str (dict, RB_PUMP_CMD_ABORT, &cmd); - if (dict_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Not a pump abort command"); - ret = _gf_false; - goto out; - } - - gf_log (this->name, GF_LOG_DEBUG, - "Hit a pump command - abort"); - ret = _gf_true; - -out: - return ret; - -} - -gf_boolean_t -pump_command_start (xlator_t *this, dict_t *dict) -{ - char *cmd = NULL; - int dict_ret = -1; - int ret = _gf_true; - - dict_ret = dict_get_str (dict, RB_PUMP_CMD_START, &cmd); - if (dict_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Not a pump start command"); - ret = _gf_false; - goto out; - } - - gf_log (this->name, GF_LOG_DEBUG, - "Hit a pump command - start"); - ret = _gf_true; - -out: - return ret; - -} - -struct _xattr_key { - char *key; - struct list_head list; -}; - -static int -__gather_xattr_keys (dict_t *dict, char *key, data_t *value, - void *data) -{ - struct list_head * list = data; - struct _xattr_key * xkey = NULL; - - if (!strncmp (key, AFR_XATTR_PREFIX, - strlen (AFR_XATTR_PREFIX))) { - - xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key); - if (!xkey) - return -1; - - xkey->key = key; - INIT_LIST_HEAD (&xkey->list); - - list_add_tail (&xkey->list, list); - } - return 0; -} - -static void -__filter_xattrs (dict_t *dict) -{ - struct list_head keys; - - struct _xattr_key *key; - struct _xattr_key *tmp; - - INIT_LIST_HEAD (&keys); - - dict_foreach (dict, __gather_xattr_keys, - (void *) &keys); - - list_for_each_entry_safe (key, tmp, &keys, list) { - dict_del (dict, key->key); - - list_del_init (&key->list); - - GF_FREE (key); - } -} - -int32_t -pump_getxattr_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - xlator_t **children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - - priv = this->private; - children = priv->children; - - local = frame->local; - - read_child = (long) cookie; - - if (op_ret == -1) { - last_index = &local->cont.getxattr.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; - - unwind = 0; - STACK_WIND_COOKIE (frame, pump_getxattr_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->getxattr, - &local->loc, - local->cont.getxattr.name, NULL); - } - -out: - if (unwind) { - if (op_ret >= 0 && dict) - __filter_xattrs (dict); - - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, NULL); - } - - return 0; -} - -int32_t -pump_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name, dict_t *xdata) -{ - afr_private_t * priv = NULL; - xlator_t ** children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - int32_t ret = -1; - int32_t op_errno = 0; - uint64_t read_child = 0; - - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, default_getxattr_cbk, - FIRST_CHILD (this), - (FIRST_CHILD (this))->fops->getxattr, - loc, name, xdata); - return 0; - } - - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - if (name) { - if (!strncmp (name, AFR_XATTR_PREFIX, - strlen (AFR_XATTR_PREFIX))) { - - op_errno = ENODATA; - goto out; - } - - if (!strcmp (name, RB_PUMP_CMD_STATUS)) { - gf_log (this->name, GF_LOG_DEBUG, - "Hit pump command - status"); - pump_execute_status (frame, this); - ret = 0; - goto out; - } - } - - local->fresh_children = GF_CALLOC (priv->child_count, - sizeof (*local->fresh_children), - gf_afr_mt_int32_t); - if (!local->fresh_children) { - ret = -1; - op_errno = ENOMEM; - goto out; - } - - read_child = afr_inode_get_read_ctx (this, loc->inode, local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.getxattr.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } - loc_copy (&local->loc, loc); - if (name) - local->cont.getxattr.name = gf_strdup (name); - - STACK_WIND_COOKIE (frame, pump_getxattr_cbk, - (void *) (long) call_child, - children[call_child], children[call_child]->fops->getxattr, - loc, name, xdata); - - ret = 0; -out: - if (ret < 0) - AFR_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL); - return 0; -} - -static int -afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (setxattr, main_frame, - local->op_ret, local->op_errno, NULL); - } - return 0; -} - -static int -afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int need_unwind = 0; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - - if (local->success_count == priv->child_count) { - need_unwind = 1; - } - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; -} - -static int -afr_setxattr_wind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (local->child_up, priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setxattr, - &local->loc, - local->cont.setxattr.dict, - local->cont.setxattr.flags, NULL); - - if (!--call_count) - break; - } - } - - return 0; -} - - -static int -afr_setxattr_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - -int32_t -pump_setxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, dict_t *xdata) -{ - AFR_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); - return 0; -} - -int -pump_command_reply (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - if (local->op_ret < 0) - gf_log (this->name, GF_LOG_INFO, - "Command failed"); - else - gf_log (this->name, GF_LOG_INFO, - "Command succeeded"); - - AFR_STACK_UNWIND (setxattr, - frame, - local->op_ret, - local->op_errno, NULL); - - return 0; -} - -int -pump_parse_command (call_frame_t *frame, xlator_t *this, - afr_local_t *local, dict_t *dict) -{ - - int ret = -1; - - if (pump_command_start (this, dict)) { - frame->local = local; - local->dict = dict_ref (dict); - ret = pump_execute_start (frame, this); - - } else if (pump_command_pause (this, dict)) { - frame->local = local; - local->dict = dict_ref (dict); - ret = pump_execute_pause (frame, this); - - } else if (pump_command_abort (this, dict)) { - frame->local = local; - local->dict = dict_ref (dict); - ret = pump_execute_abort (frame, this); - - } else if (pump_command_commit (this, dict)) { - frame->local = local; - local->dict = dict_ref (dict); - ret = pump_execute_commit (frame, this); - } - return ret; -} - -int -pump_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.pump*", dict, - op_errno, out); - - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, default_setxattr_cbk, - FIRST_CHILD (this), - (FIRST_CHILD (this))->fops->setxattr, - loc, dict, flags, xdata); - return 0; - } - - - AFR_LOCAL_ALLOC_OR_GOTO (local, out); - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) { - afr_local_cleanup (local, this); - goto out; - } - - ret = pump_parse_command (frame, this, - local, dict); - if (ret >= 0) { - ret = 0; - goto out; - } - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - op_errno = ENOMEM; - ret = -1; - afr_local_cleanup (local, this); - goto out; - } - - transaction_frame->local = local; - - local->op_ret = -1; - - local->cont.setxattr.dict = dict_ref (dict); - local->cont.setxattr.flags = flags; - - local->transaction.fop = afr_setxattr_wind; - local->transaction.done = afr_setxattr_done; - local->transaction.unwind = afr_setxattr_unwind; - - loc_copy (&local->loc, loc); - - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; - - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); - - ret = 0; -out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); - } - - return 0; -} - -/* Defaults */ -static int32_t -pump_lookup (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *xattr_req) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_lookup_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, - loc, - xattr_req); - return 0; - } - - afr_lookup (frame, this, loc, xattr_req); - return 0; -} - - -static int32_t -pump_truncate (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - off_t offset, dict_t *xdata) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_truncate_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->truncate, - loc, - offset, xdata); - return 0; - } - - afr_truncate (frame, this, loc, offset, xdata); - return 0; -} - - -static int32_t -pump_ftruncate (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - off_t offset, dict_t *xdata) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_ftruncate_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->ftruncate, - fd, - offset, xdata); - return 0; - } - - afr_ftruncate (frame, this, fd, offset, xdata); - return 0; -} - - - - -int -pump_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, default_mknod_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->mknod, - loc, mode, rdev, umask, xdata); - return 0; - } - afr_mknod (frame, this, loc, mode, rdev, umask, xdata); - return 0; - -} - - - -int -pump_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, default_mkdir_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->mkdir, - loc, mode, umask, xdata); - return 0; - } - afr_mkdir (frame, this, loc, mode, umask, xdata); - return 0; - -} - - -static int32_t -pump_unlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc, int xflag, dict_t *xdata) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_unlink_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->unlink, - loc, xflag, xdata); - return 0; - } - afr_unlink (frame, this, loc, xflag, xdata); - return 0; - -} - - -static int -pump_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags, dict_t *xdata) -{ - afr_private_t *priv = NULL; - - priv = this->private; - - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, default_rmdir_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->rmdir, - loc, flags, xdata); - return 0; - } - - afr_rmdir (frame, this, loc, flags, xdata); - return 0; - -} - - - -int -pump_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *loc, mode_t umask, dict_t *xdata) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, default_symlink_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->symlink, - linkpath, loc, umask, xdata); - return 0; - } - afr_symlink (frame, this, linkpath, loc, umask, xdata); - return 0; - -} - - -static int32_t -pump_rename (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc, dict_t *xdata) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_rename_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->rename, - oldloc, newloc, xdata); - return 0; - } - afr_rename (frame, this, oldloc, newloc, xdata); - return 0; - -} - - -static int32_t -pump_link (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc, dict_t *xdata) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_link_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->link, - oldloc, newloc, xdata); - return 0; - } - afr_link (frame, this, oldloc, newloc, xdata); - return 0; - -} - - -static int32_t -pump_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, - mode_t umask, fd_t *fd, dict_t *xdata) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, default_create_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->create, - loc, flags, mode, umask, fd, xdata); - return 0; - } - afr_create (frame, this, loc, flags, mode, umask, fd, xdata); - return 0; - -} - - -static int32_t -pump_open (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flags, fd_t *fd, dict_t *xdata) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_open_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, - loc, flags, fd, xdata); - return 0; - } - afr_open (frame, this, loc, flags, fd, xdata); - return 0; - -} - - -static int32_t -pump_writev (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - struct iovec *vector, - int32_t count, - off_t off, uint32_t flags, - struct iobref *iobref, dict_t *xdata) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_writev_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->writev, - fd, - vector, - count, - off, flags, - iobref, xdata); - return 0; - } - - afr_writev (frame, this, fd, vector, count, off, flags, iobref, xdata); - return 0; -} - - -static int32_t -pump_flush (call_frame_t *frame, - xlator_t *this, - fd_t *fd, dict_t *xdata) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_flush_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->flush, - fd, xdata); - return 0; - } - afr_flush (frame, this, fd, xdata); - return 0; - -} - - -static int32_t -pump_fsync (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t flags, dict_t *xdata) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_fsync_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsync, - fd, - flags, xdata); - return 0; - } - afr_fsync (frame, this, fd, flags, xdata); - return 0; - -} - - -static int32_t -pump_opendir (call_frame_t *frame, - xlator_t *this, - loc_t *loc, fd_t *fd, dict_t *xdata) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_opendir_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->opendir, - loc, fd, xdata); - return 0; - } - afr_opendir (frame, this, loc, fd, xdata); - return 0; - -} - - -static int32_t -pump_fsyncdir (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t flags, dict_t *xdata) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_fsyncdir_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsyncdir, - fd, - flags, xdata); - return 0; - } - afr_fsyncdir (frame, this, fd, flags, xdata); - return 0; - -} - - -static int32_t -pump_xattrop (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - gf_xattrop_flags_t flags, - dict_t *dict, dict_t *xdata) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_xattrop_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->xattrop, - loc, - flags, - dict, xdata); - return 0; - } - afr_xattrop (frame, this, loc, flags, dict, xdata); - return 0; - -} - -static int32_t -pump_fxattrop (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - gf_xattrop_flags_t flags, - dict_t *dict, dict_t *xdata) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_fxattrop_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fxattrop, - fd, - flags, - dict, xdata); - return 0; - } - afr_fxattrop (frame, this, fd, flags, dict, xdata); - return 0; - -} - - -static int32_t -pump_removexattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - const char *name, dict_t *xdata) -{ - afr_private_t *priv = NULL; - int op_errno = -1; - - VALIDATE_OR_GOTO (this, out); - - GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.pump*", - name, op_errno, out); - - op_errno = 0; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_removexattr_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->removexattr, - loc, - name, xdata); - return 0; - } - afr_removexattr (frame, this, loc, name, xdata); - - out: - if (op_errno) - AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); - return 0; - -} - - - -static int32_t -pump_readdir (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t off, dict_t *xdata) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_readdir_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readdir, - fd, size, off, xdata); - return 0; - } - afr_readdir (frame, this, fd, size, off, xdata); - return 0; - -} - - -static int32_t -pump_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, - size_t size, off_t off, dict_t *dict) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_readdirp_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readdirp, - fd, size, off, dict); - return 0; - } - afr_readdirp (frame, this, fd, size, off, dict); - return 0; - -} - - - -static int32_t -pump_releasedir (xlator_t *this, - fd_t *fd) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (priv->use_afr_in_pump) - afr_releasedir (this, fd); - return 0; - -} - -static int32_t -pump_release (xlator_t *this, - fd_t *fd) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (priv->use_afr_in_pump) - afr_release (this, fd); - return 0; - -} - -static int32_t -pump_forget (xlator_t *this, inode_t *inode) -{ - afr_private_t *priv = NULL; - - priv = this->private; - if (priv->use_afr_in_pump) - afr_forget (this, inode); - - return 0; -} - -static int32_t -pump_setattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - struct iatt *stbuf, - int32_t valid, dict_t *xdata) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_setattr_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->setattr, - loc, stbuf, valid, xdata); - return 0; - } - afr_setattr (frame, this, loc, stbuf, valid, xdata); - return 0; - -} - - -static int32_t -pump_fsetattr (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - struct iatt *stbuf, - int32_t valid, dict_t *xdata) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_fsetattr_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fsetattr, - fd, stbuf, valid, xdata); - return 0; - } - afr_fsetattr (frame, this, fd, stbuf, valid, xdata); - return 0; - -} - - -/* End of defaults */ - - -int32_t -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - if (!this) - return ret; - - ret = xlator_mem_acct_init (this, gf_afr_mt_end + 1); - - if (ret != 0) { - gf_log(this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } - - return ret; -} - -static int -is_xlator_pump_sink (xlator_t *child) -{ - return (child == PUMP_SINK_CHILD(THIS)); -} - -static int -is_xlator_pump_source (xlator_t *child) -{ - return (child == PUMP_SOURCE_CHILD(THIS)); -} - -int32_t -notify (xlator_t *this, int32_t event, - void *data, ...) -{ - int ret = -1; - xlator_t *child_xl = NULL; - - child_xl = (xlator_t *) data; - - ret = afr_notify (this, event, data, NULL); - - switch (event) { - case GF_EVENT_CHILD_DOWN: - if (is_xlator_pump_source (child_xl)) - pump_change_state (this, PUMP_STATE_ABORT); - break; - - case GF_EVENT_CHILD_UP: - if (is_xlator_pump_sink (child_xl)) - if (is_pump_start_pending (this)) { - gf_log (this->name, GF_LOG_DEBUG, - "about to start synctask"); - ret = pump_start_synctask (this); - if (ret < 0) - gf_log (this->name, GF_LOG_DEBUG, - "Could not start pump " - "synctask"); - else - pump_remove_start_pending (this); - } - } - - return ret; -} - -int32_t -init (xlator_t *this) -{ - afr_private_t * priv = NULL; - pump_private_t *pump_priv = NULL; - int child_count = 0; - xlator_list_t * trav = NULL; - int i = 0; - int ret = -1; - GF_UNUSED int op_errno = 0; - - int source_child = 0; - - if (!this->children) { - gf_log (this->name, GF_LOG_ERROR, - "pump translator needs a source and sink" - "subvolumes defined."); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "Volume is dangling."); - } - - this->private = GF_CALLOC (1, sizeof (afr_private_t), - gf_afr_mt_afr_private_t); - if (!this->private) - goto out; - - priv = this->private; - LOCK_INIT (&priv->lock); - LOCK_INIT (&priv->read_child_lock); - //lock recovery is not done in afr - pthread_mutex_init (&priv->mutex, NULL); - INIT_LIST_HEAD (&priv->saved_fds); - - child_count = xlator_subvolume_count (this); - if (child_count != 2) { - gf_log (this->name, GF_LOG_ERROR, - "There should be exactly 2 children - one source " - "and one sink"); - return -1; - } - priv->child_count = child_count; - - priv->read_child = source_child; - priv->favorite_child = source_child; - priv->background_self_heal_count = 0; - - priv->data_self_heal = "on"; - priv->metadata_self_heal = 1; - priv->entry_self_heal = 1; - - priv->data_self_heal_window_size = 16; - - priv->data_change_log = 1; - priv->metadata_change_log = 1; - priv->entry_change_log = 1; - priv->use_afr_in_pump = 1; - priv->sh_readdir_size = 65536; - - /* Locking options */ - - /* Lock server count infact does not matter. Locks are held - on all subvolumes, in this case being the source - and the sink. - */ - - priv->strict_readdir = _gf_false; - - priv->wait_count = 1; - priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, - gf_afr_mt_char); - if (!priv->child_up) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - op_errno = ENOMEM; - goto out; - } - - priv->children = GF_CALLOC (sizeof (xlator_t *), child_count, - gf_afr_mt_xlator_t); - if (!priv->children) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - op_errno = ENOMEM; - goto out; - } - - priv->pending_key = GF_CALLOC (sizeof (*priv->pending_key), - child_count, - gf_afr_mt_char); - if (!priv->pending_key) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - op_errno = ENOMEM; - goto out; - } - - trav = this->children; - i = 0; - while (i < child_count) { - priv->children[i] = trav->xlator; - - ret = gf_asprintf (&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX, - trav->xlator->name); - if (-1 == ret) { - gf_log (this->name, GF_LOG_ERROR, - "asprintf failed to set pending key"); - op_errno = ENOMEM; - goto out; - } - - trav = trav->next; - i++; - } - - ret = gf_asprintf (&priv->sh_domain, "%s-self-heal", this->name); - if (-1 == ret) { - op_errno = ENOMEM; - goto out; - } - - priv->first_lookup = 1; - priv->root_inode = NULL; - - priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event), - gf_afr_mt_int32_t); - if (!priv->last_event) { - ret = -ENOMEM; - goto out; - } - - pump_priv = GF_CALLOC (1, sizeof (*pump_priv), - gf_afr_mt_pump_priv); - if (!pump_priv) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - op_errno = ENOMEM; - goto out; - } - - LOCK_INIT (&pump_priv->resume_path_lock); - LOCK_INIT (&pump_priv->pump_state_lock); - - pump_priv->resume_path = GF_CALLOC (1, PATH_MAX, - gf_afr_mt_char); - if (!pump_priv->resume_path) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - ret = -1; - goto out; - } - - pump_priv->env = this->ctx->env; - if (!pump_priv->env) { - gf_log (this->name, GF_LOG_ERROR, - "Could not create new sync-environment"); - ret = -1; - goto out; - } - - /* keep more local here as we may need them for self-heal etc */ - this->local_pool = mem_pool_new (afr_local_t, 128); - if (!this->local_pool) { - ret = -1; - gf_log (this->name, GF_LOG_ERROR, - "failed to create local_t's memory pool"); - goto out; - } - - priv->pump_private = pump_priv; - - pump_change_state (this, PUMP_STATE_ABORT); - - ret = 0; -out: - return ret; -} - -int -fini (xlator_t *this) -{ - afr_private_t * priv = NULL; - pump_private_t *pump_priv = NULL; - - priv = this->private; - this->private = NULL; - if (!priv) - goto out; - - pump_priv = priv->pump_private; - if (!pump_priv) - goto afr_priv; - - GF_FREE (pump_priv->resume_path); - LOCK_DESTROY (&pump_priv->resume_path_lock); - LOCK_DESTROY (&pump_priv->pump_state_lock); - GF_FREE (pump_priv); -afr_priv: - afr_priv_destroy (priv); -out: - return 0; -} - - -struct xlator_fops fops = { - .lookup = pump_lookup, - .open = pump_open, - .flush = pump_flush, - .fsync = pump_fsync, - .fsyncdir = pump_fsyncdir, - .xattrop = pump_xattrop, - .fxattrop = pump_fxattrop, - .getxattr = pump_getxattr, - - /* inode write */ - .writev = pump_writev, - .truncate = pump_truncate, - .ftruncate = pump_ftruncate, - .setxattr = pump_setxattr, - .setattr = pump_setattr, - .fsetattr = pump_fsetattr, - .removexattr = pump_removexattr, - - /* dir read */ - .opendir = pump_opendir, - .readdir = pump_readdir, - .readdirp = pump_readdirp, - - /* dir write */ - .create = pump_create, - .mknod = pump_mknod, - .mkdir = pump_mkdir, - .unlink = pump_unlink, - .rmdir = pump_rmdir, - .link = pump_link, - .symlink = pump_symlink, - .rename = pump_rename, -}; - -struct xlator_dumpops dumpops = { - .priv = afr_priv_dump, -}; - - -struct xlator_cbks cbks = { - .release = pump_release, - .releasedir = pump_releasedir, - .forget = pump_forget, -}; - -struct volume_options options[] = { - { .key = {NULL} }, -}; diff --git a/xlators/cluster/afr/src/pump.h b/xlators/cluster/afr/src/pump.h deleted file mode 100644 index bc4c31a78a5..00000000000 --- a/xlators/cluster/afr/src/pump.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#ifndef __PUMP_H__ -#define __PUMP_H__ - -#include "syncop.h" - -/* FIXME: Needs to be defined in a common file */ -#define CLIENT_CMD_CONNECT "trusted.glusterfs.client-connect" -#define CLIENT_CMD_DISCONNECT "trusted.glusterfs.client-disconnect" - -#define PUMP_SOURCE_COMPLETE "trusted.glusterfs.pump-source-complete" -#define PUMP_SINK_COMPLETE "trusted.glusterfs.pump-sink-complete" - -#define PUMP_PATH "trusted.glusterfs.pump-path" - -#define PUMP_SOURCE_CHILD(xl) (xl->children->xlator) -#define PUMP_SINK_CHILD(xl) (xl->children->next->xlator) - -typedef enum { - PUMP_STATE_RUNNING, /* Pump is running and migrating files */ - PUMP_STATE_RESUME, /* Pump is resuming from a previous pause */ - PUMP_STATE_PAUSE, /* Pump is paused */ - PUMP_STATE_ABORT, /* Pump is aborted */ - PUMP_STATE_COMMIT, /* Pump is commited */ -} pump_state_t; - -typedef struct _pump_private { - struct syncenv *env; /* The env pointer to the pump synctask */ - char *resume_path; /* path to resume from the last pause */ - gf_lock_t resume_path_lock; /* Synchronize resume_path changes */ - gf_lock_t pump_state_lock; /* Synchronize pump_state changes */ - pump_state_t pump_state; /* State of pump */ - char current_file[PATH_MAX]; /* Current file being pumped */ - uint64_t number_files_pumped; /* Number of files pumped */ - gf_boolean_t pump_finished; /* Boolean to indicate pump termination */ - char pump_start_pending; /* Boolean to mark start pending until - CHILD_UP */ - call_stub_t *cleaner; -} pump_private_t; - -void -build_root_loc (inode_t *inode, loc_t *loc); -int pump_start (call_frame_t *frame, xlator_t *this); - -gf_boolean_t -pump_command_start (xlator_t *this, dict_t *dict); - -int -pump_execute_start (call_frame_t *frame, xlator_t *this); - -gf_boolean_t -pump_command_pause (xlator_t *this, dict_t *dict); - -int -pump_execute_pause (call_frame_t *frame, xlator_t *this); - -gf_boolean_t -pump_command_abort (xlator_t *this, dict_t *dict); - -int -pump_execute_abort (call_frame_t *frame, xlator_t *this); - -gf_boolean_t -pump_command_status (xlator_t *this, dict_t *dict); - -int -pump_execute_status (call_frame_t *frame, xlator_t *this); - -#endif /* __PUMP_H__ */ |
