diff options
Diffstat (limited to 'xlators/cluster/afr/src/afr.c')
| -rw-r--r-- | xlators/cluster/afr/src/afr.c | 3034 |
1 files changed, 643 insertions, 2391 deletions
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 270364ff9..c724eb2ae 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #include <libgen.h> @@ -28,2514 +19,775 @@ #define _CONFIG_H #include "config.h" #endif +#include "afr-common.c" -#include "glusterfs.h" -#include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" - -#include "fd.h" - -#include "afr-inode-read.h" -#include "afr-inode-write.h" -#include "afr-dir-read.h" -#include "afr-dir-write.h" -#include "afr-transaction.h" - -#include "afr-self-heal.h" - - -uint64_t -afr_is_split_brain (xlator_t *this, inode_t *inode) -{ - int ret = 0; - - uint64_t ctx = 0; - uint64_t split_brain = 0; - - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); - - if (ret < 0) - goto unlock; - - split_brain = ctx & 0xFFFFFFFF00000000ULL; - } -unlock: - UNLOCK (&inode->lock); - - return split_brain; -} +#define SHD_INODE_LRU_LIMIT 2048 +#define AFR_EH_HEALED_LIMIT 1024 +#define AFR_EH_HEAL_FAIL_LIMIT 1024 +#define AFR_EH_SPLIT_BRAIN_LIMIT 1024 +struct volume_options options[]; -void -afr_set_split_brain (xlator_t *this, inode_t *inode, int32_t split_brain) +int32_t +notify (xlator_t *this, int32_t event, + void *data, ...) 
{ - uint64_t ctx = 0; - int ret = 0; + int ret = -1; + va_list ap; + void *data2 = NULL; - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); + va_start (ap, data); + data2 = va_arg (ap, dict_t*); + va_end (ap); + ret = afr_notify (this, event, data, data2); - if (ret < 0) { - ctx = 0; - } + return ret; +} - ctx = (0x00000000FFFFFFFFULL & ctx) - | (split_brain & 0xFFFFFFFF00000000ULL); +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; - __inode_ctx_put (inode, this, ctx); - } - UNLOCK (&inode->lock); -} + if (!this) + return ret; + ret = xlator_mem_acct_init (this, gf_afr_mt_end + 1); -uint64_t -afr_read_child (xlator_t *this, inode_t *inode) -{ - int ret = 0; - - uint64_t ctx = 0; - uint64_t read_child = 0; - - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); - - if (ret < 0) - goto unlock; - - read_child = ctx & 0x00000000FFFFFFFFULL; + if (ret != 0) { + gf_log(this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); + return ret; } -unlock: - UNLOCK (&inode->lock); - return read_child; + return ret; } -void -afr_set_read_child (xlator_t *this, inode_t *inode, int32_t read_child) +int +xlator_subvolume_index (xlator_t *this, xlator_t *subvol) { - uint64_t ctx = 0; - int ret = 0; + int index = -1; + int i = 0; + xlator_list_t *list = NULL; - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); + list = this->children; - if (ret < 0) { - ctx = 0; + while (list) { + if (subvol == list->xlator || + strcmp (subvol->name, list->xlator->name) == 0) { + index = i; + break; } - - ctx = (0xFFFFFFFF00000000ULL & ctx) - | (0x00000000FFFFFFFFULL & read_child); - - __inode_ctx_put (inode, this, ctx); + list = list->next; + i++; } - UNLOCK (&inode->lock); -} - -/** - * afr_local_cleanup - cleanup everything in frame->local - */ - -void -afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) -{ - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - - - sh = &local->self_heal; - priv 
= this->private; - - if (sh->buf) - FREE (sh->buf); - - if (sh->xattr) { - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - dict_unref (sh->xattr[i]); - sh->xattr[i] = NULL; - } - } - FREE (sh->xattr); - } - - if (sh->child_errno) - FREE (sh->child_errno); - - if (sh->pending_matrix) { - for (i = 0; i < priv->child_count; i++) { - FREE (sh->pending_matrix[i]); - } - FREE (sh->pending_matrix); - } - - if (sh->delta_matrix) { - for (i = 0; i < priv->child_count; i++) { - FREE (sh->delta_matrix[i]); - } - FREE (sh->delta_matrix); - } - - if (sh->sources) - FREE (sh->sources); - - if (sh->success) - FREE (sh->success); - - if (sh->healing_fd) { - fd_unref (sh->healing_fd); - sh->healing_fd = NULL; - } - - loc_wipe (&sh->parent_loc); + return index; } - -void -afr_local_cleanup (afr_local_t *local, xlator_t *this) +void +fix_quorum_options (xlator_t *this, afr_private_t *priv, char *qtype) { - int i; - afr_private_t * priv = NULL; - - if (!local) - return; - - afr_local_sh_cleanup (local, this); - - FREE (local->child_errno); - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (local->pending && local->pending[i]) - FREE (local->pending[i]); + if (priv->quorum_count && strcmp(qtype,"fixed")) { + gf_log(this->name,GF_LOG_WARNING, + "quorum-type %s overriding quorum-count %u", + qtype, priv->quorum_count); + } + if (!strcmp(qtype,"none")) { + priv->quorum_count = 0; + } + else if (!strcmp(qtype,"auto")) { + priv->quorum_count = AFR_QUORUM_AUTO; } - - FREE (local->pending); - - loc_wipe (&local->loc); - loc_wipe (&local->newloc); - - FREE (local->transaction.locked_nodes); - FREE (local->transaction.child_errno); - - FREE (local->transaction.basename); - FREE (local->transaction.new_basename); - - loc_wipe (&local->transaction.parent_loc); - loc_wipe (&local->transaction.new_parent_loc); - - if (local->fd) - fd_unref (local->fd); - - if (local->xattr_req) - dict_unref (local->xattr_req); - - FREE (local->child_up); - - { /* lookup 
*/ - if (local->cont.lookup.xattr) - dict_unref (local->cont.lookup.xattr); - } - - { /* getxattr */ - if (local->cont.getxattr.name) - FREE (local->cont.getxattr.name); - } - - { /* lk */ - if (local->cont.lk.locked_nodes) - FREE (local->cont.lk.locked_nodes); - } - - { /* checksum */ - if (local->cont.checksum.file_checksum) - FREE (local->cont.checksum.file_checksum); - if (local->cont.checksum.dir_checksum) - FREE (local->cont.checksum.dir_checksum); - } - - { /* create */ - if (local->cont.create.fd) - fd_unref (local->cont.create.fd); - } - - { /* writev */ - FREE (local->cont.writev.vector); - } - - { /* setxattr */ - if (local->cont.setxattr.dict) - dict_unref (local->cont.setxattr.dict); - } - - { /* removexattr */ - FREE (local->cont.removexattr.name); - } - - { /* symlink */ - FREE (local->cont.symlink.linkpath); - } } - int -afr_frame_return (call_frame_t *frame) +reconfigure (xlator_t *this, dict_t *options) { - afr_local_t *local = NULL; - int call_count = 0; + afr_private_t *priv = NULL; + xlator_t *read_subvol = NULL; + int read_subvol_index = -1; + int ret = -1; + int index = -1; + char *qtype = NULL; - local = frame->local; + priv = this->private; - LOCK (&frame->lock); - { - call_count = --local->call_count; - } - UNLOCK (&frame->lock); + GF_OPTION_RECONF ("background-self-heal-count", + priv->background_self_heal_count, options, uint32, + out); - return call_count; -} + GF_OPTION_RECONF ("metadata-self-heal", + priv->metadata_self_heal, options, bool, out); -/** - * first_up_child - return the index of the first child that is up - */ + GF_OPTION_RECONF ("data-self-heal", priv->data_self_heal, options, str, + out); -int -afr_first_up_child (afr_private_t *priv) -{ - xlator_t ** children = NULL; - int ret = -1; - int i = 0; - - LOCK (&priv->lock); - { - children = priv->children; - for (i = 0; i < priv->child_count; i++) { - if (priv->child_up[i]) { - ret = i; - break; - } - } - } - UNLOCK (&priv->lock); - - return ret; -} + GF_OPTION_RECONF 
("entry-self-heal", priv->entry_self_heal, options, + bool, out); + GF_OPTION_RECONF ("strict-readdir", priv->strict_readdir, options, bool, + out); -/** - * up_children_count - return the number of children that are up - */ + GF_OPTION_RECONF ("data-self-heal-window-size", + priv->data_self_heal_window_size, options, + uint32, out); -int -afr_up_children_count (int child_count, unsigned char *child_up) -{ - int i = 0; - int ret = 0; + GF_OPTION_RECONF ("data-change-log", priv->data_change_log, options, + bool, out); - for (i = 0; i < child_count; i++) - if (child_up[i]) - ret++; - return ret; -} + GF_OPTION_RECONF ("metadata-change-log", + priv->metadata_change_log, options, bool, out); + GF_OPTION_RECONF ("entry-change-log", priv->entry_change_log, options, + bool, out); -int -afr_locked_nodes_count (unsigned char *locked_nodes, int child_count) -{ - int ret = 0; - int i; + GF_OPTION_RECONF ("data-self-heal-algorithm", + priv->data_self_heal_algorithm, options, str, out); - for (i = 0; i < child_count; i++) - if (locked_nodes[i]) - ret++; + GF_OPTION_RECONF ("self-heal-daemon", priv->shd.enabled, options, bool, out); - return ret; -} + GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out); + GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode, + options, uint32, out); -ino64_t -afr_itransform (ino64_t ino, int child_count, int child_index) -{ - ino64_t scaled_ino = -1; + if (read_subvol) { + index = xlator_subvolume_index (this, read_subvol); + if (index == -1) { + gf_log (this->name, GF_LOG_ERROR, "%s not a subvolume", + read_subvol->name); + goto out; + } + priv->read_child = index; + } - if (ino == ((uint64_t) -1)) { - scaled_ino = ((uint64_t) -1); - goto out; - } + GF_OPTION_RECONF ("read-subvolume-index",read_subvol_index, options,int32,out); - scaled_ino = (ino * child_count) + child_index; + if (read_subvol_index >-1) { + index=read_subvol_index; + if (index >= priv->child_count) { + gf_log (this->name, GF_LOG_ERROR, "%d not a 
subvolume-index", + index); + goto out; + } + priv->read_child = index; + } + GF_OPTION_RECONF ("eager-lock", priv->eager_lock, options, bool, out); + GF_OPTION_RECONF ("quorum-type", qtype, options, str, out); + GF_OPTION_RECONF ("quorum-count", priv->quorum_count, options, + uint32, out); + fix_quorum_options(this,priv,qtype); + GF_OPTION_RECONF ("heal-timeout", priv->shd.timeout, options, + int32, out); + + GF_OPTION_RECONF ("post-op-delay-secs", priv->post_op_delay_secs, options, + uint32, out); + + GF_OPTION_RECONF (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, + options, size, out); + /* Reset this so we re-discover in case the topology changed. */ + GF_OPTION_RECONF ("readdir-failover", priv->readdir_failover, options, + bool, out); + GF_OPTION_RECONF ("ensure-durability", priv->ensure_durability, options, + bool, out); + priv->did_discovery = _gf_false; + + ret = 0; out: - return scaled_ino; -} - - -int -afr_deitransform_orig (ino64_t ino, int child_count) -{ - int index = -1; - - index = ino % child_count; - - return index; -} - - -int -afr_deitransform (ino64_t ino, int child_count) -{ - return 0; -} - - -int -afr_self_heal_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - int ret = -1; - - local = frame->local; - - if (local->govinda_gOvinda) { - afr_set_split_brain (this, local->cont.lookup.inode, 1); - - if (ret < 0) { - local->op_ret = -1; - local->op_errno = -ret; - } - } else { - afr_set_split_brain (this, local->cont.lookup.inode, 0); - } - - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->cont.lookup.inode, - &local->cont.lookup.buf, - local->cont.lookup.xattr); + return ret; - return 0; } -int -afr_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - inode_t *inode, struct stat *buf, dict_t *xattr) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - struct stat * lookup_buf = NULL; - int call_count = -1; - int child_index = -1; - - uint32_t 
open_fd_count = 0; - int ret = 0; - - child_index = (long) cookie; - priv = this->private; - - LOCK (&frame->lock); - { - local = frame->local; - - lookup_buf = &local->cont.lookup.buf; - - if (op_ret == -1) { - if (op_errno == ENOENT) - local->enoent_count++; - - if (op_errno != ENOTCONN) { - if (local->op_errno != ESTALE) - local->op_errno = op_errno; - } - - if (op_errno == ESTALE) { - /* no matter what other subvolumes return for - * this call, ESTALE _must_ be sent to parent - */ - local->op_ret = -1; - local->op_errno = ESTALE; - } - goto unlock; - } - - if (afr_sh_has_metadata_pending (xattr, child_index, this)) - local->need_metadata_self_heal = 1; - - if (afr_sh_has_entry_pending (xattr, child_index, this)) - local->need_entry_self_heal = 1; - - if (afr_sh_has_data_pending (xattr, child_index, this)) - local->need_data_self_heal = 1; - - ret = dict_get_uint32 (xattr, GLUSTERFS_OPEN_FD_COUNT, - &open_fd_count); - local->open_fd_count += open_fd_count; - - /* in case of revalidate, we need to send stat of the - * child whose stat was sent during the first lookup. - * (so that time stamp does not vary with revalidate. - * in case it is down, stat of the fist success will - * be replied */ - - /* inode number should be preserved across revalidates */ - - if (local->success_count == 0) { - if (local->op_errno != ESTALE) - local->op_ret = op_ret; - - local->cont.lookup.inode = inode; - local->cont.lookup.xattr = dict_ref (xattr); - - *lookup_buf = *buf; - lookup_buf->st_ino = afr_itransform (buf->st_ino, - priv->child_count, - child_index); - - if (priv->read_child >= 0) { - afr_set_read_child (this, - local->cont.lookup.inode, - priv->read_child); - } else { - afr_set_read_child (this, - local->cont.lookup.inode, - child_index); - } - - } else { - if ((local->op_ret == 0) - && (child_index == local->read_child_index)) { - - /* - lookup has succeeded on the read child. 
- So use its inode number - */ - - local->op_ret = op_ret; - - if (local->cont.lookup.xattr) - dict_unref (local->cont.lookup.xattr); - - local->cont.lookup.inode = inode; - local->cont.lookup.xattr = dict_ref (xattr); - - *lookup_buf = *buf; - lookup_buf->st_ino = afr_itransform (buf->st_ino, - priv->child_count, - child_index); - - if (priv->read_child >= 0) { - afr_set_read_child (this, - local->cont.lookup.inode, - priv->read_child); - } else { - afr_set_read_child (this, - local->cont.lookup.inode, - local->read_child_index); - } - } - - if (FILETYPE_DIFFERS (buf, lookup_buf)) { - /* mismatching filetypes with same name - -- Govinda !! GOvinda !!! - */ - local->govinda_gOvinda = 1; - } - - if (PERMISSION_DIFFERS (buf, lookup_buf)) { - /* mismatching permissions */ - local->need_metadata_self_heal = 1; - } - - if (OWNERSHIP_DIFFERS (buf, lookup_buf)) { - /* mismatching permissions */ - local->need_metadata_self_heal = 1; - } - - if (SIZE_DIFFERS (buf, lookup_buf) - && S_ISREG (buf->st_mode)) { - local->need_data_self_heal = 1; - } - } - - local->success_count++; - } -unlock: - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (local->op_ret == 0) { - /* KLUDGE: assuming DHT will not itransform in - revalidate */ - if (local->cont.lookup.inode->ino) - lookup_buf->st_ino = - local->cont.lookup.inode->ino; - } - - if (local->success_count && local->enoent_count) { - local->need_metadata_self_heal = 1; - local->need_data_self_heal = 1; - local->need_entry_self_heal = 1; - } - - if (local->success_count) { - /* check for split-brain case in previous lookup */ - if (afr_is_split_brain (this, - local->cont.lookup.inode)) - local->need_data_self_heal = 1; - } - - if ((local->need_metadata_self_heal - || local->need_data_self_heal - || local->need_entry_self_heal) - && (!local->open_fd_count)) { - - if (!local->cont.lookup.inode->st_mode) { - /* fix for RT #602 */ - local->cont.lookup.inode->st_mode = - 
lookup_buf->st_mode; - } - - afr_self_heal (frame, this, afr_self_heal_cbk); - } else { - AFR_STACK_UNWIND (frame, local->op_ret, - local->op_errno, - local->cont.lookup.inode, - &local->cont.lookup.buf, - local->cont.lookup.xattr); - } - } - - return 0; -} +static const char *favorite_child_warning_str = "You have specified subvolume '%s' " + "as the 'favorite child'. This means that if a discrepancy in the content " + "or attributes (ownership, permission, etc.) of a file is detected among " + "the subvolumes, the file on '%s' will be considered the definitive " + "version and its contents will OVERWRITE the contents of the file on other " + "subvolumes. All versions of the file except that on '%s' " + "WILL BE LOST."; -int -afr_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) +int32_t +init (xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int ret = -1; - int i = 0; - - uint64_t ctx; - - int32_t op_errno = 0; - - priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - local->op_ret = -1; - - frame->local = local; - - loc_copy (&local->loc, loc); - - ret = inode_ctx_get (loc->inode, this, &ctx); - if (ret == 0) { - /* lookup is a revalidate */ - - local->read_child_index = afr_read_child (this, loc->inode); - } else { - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); + afr_private_t *priv = NULL; + int child_count = 0; + xlator_list_t *trav = NULL; + int i = 0; + int ret = -1; + GF_UNUSED int op_errno = 0; + xlator_t *read_subvol = NULL; + int read_subvol_index = -1; + xlator_t *fav_child = NULL; + char *qtype = NULL; + + if (!this->children) { + gf_log (this->name, GF_LOG_ERROR, + "replicate translator needs more than one " + "subvolume defined."); + return -1; } - local->call_count = priv->child_count; - - local->child_up = memdup (priv->child_up, priv->child_count); - local->child_count 
= afr_up_children_count (priv->child_count, - local->child_up); - - /* By default assume ENOTCONN. On success it will be set to 0. */ - local->op_errno = ENOTCONN; - - if (xattr_req == NULL) - local->xattr_req = dict_new (); - else - local->xattr_req = dict_ref (xattr_req); - - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_uint64 (local->xattr_req, priv->pending_key[i], - 3 * sizeof(int32_t)); - - /* 3 = data+metadata+entry */ + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "Volume is dangling."); } - ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_OPEN_FD_COUNT, 0); - - for (i = 0; i < priv->child_count; i++) { - STACK_WIND_COOKIE (frame, afr_lookup_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - loc, local->xattr_req); - } - - ret = 0; -out: - if (ret == -1) - AFR_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL); - - return 0; -} - - -/* {{{ open */ - -int -afr_fd_ctx_set (xlator_t *this, fd_t *fd) -{ - afr_private_t * priv = NULL; - - int op_ret = 0; - int ret = 0; - - uint64_t ctx; - afr_fd_ctx_t * fd_ctx = NULL; + this->private = GF_CALLOC (1, sizeof (afr_private_t), + gf_afr_mt_afr_private_t); + if (!this->private) + goto out; priv = this->private; + LOCK_INIT (&priv->lock); + LOCK_INIT (&priv->read_child_lock); + //lock recovery is not done in afr + pthread_mutex_init (&priv->mutex, NULL); + INIT_LIST_HEAD (&priv->saved_fds); - LOCK (&fd->lock); - { - ret = __fd_ctx_get (fd, this, &ctx); - - if (ret == 0) - goto out; + child_count = xlator_subvolume_count (this); - fd_ctx = CALLOC (1, sizeof (afr_fd_ctx_t)); - if (!fd_ctx) { - gf_log (this->name, GF_LOG_ERROR, - "out of memory :("); - - op_ret = -ENOMEM; - goto out; - } + priv->child_count = child_count; - fd_ctx->child_failed = CALLOC (sizeof (*fd_ctx->child_failed), - priv->child_count); - - if (!fd_ctx->child_failed) { - gf_log (this->name, GF_LOG_ERROR, - "out of memory :("); + priv->read_child = -1; - op_ret = -ENOMEM; + 
GF_OPTION_INIT ("read-subvolume", read_subvol, xlator, out); + if (read_subvol) { + priv->read_child = xlator_subvolume_index (this, read_subvol); + if (priv->read_child == -1) { + gf_log (this->name, GF_LOG_ERROR, "%s not a subvolume", + read_subvol->name); goto out; } - - ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); - if (ret < 0) { - op_ret = ret; + } + GF_OPTION_INIT ("read-subvolume-index",read_subvol_index,int32,out); + if (read_subvol_index > -1) { + if (read_subvol_index >= priv->child_count) { + gf_log (this->name, GF_LOG_ERROR, "%d not a subvolume-index", + read_subvol_index); + goto out; } + priv->read_child = read_subvol_index; } -out: - UNLOCK (&fd->lock); + GF_OPTION_INIT ("choose-local", priv->choose_local, bool, out); - return ret; -} - - -int -afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) -{ - afr_local_t * local = frame->local; - int ret = 0; - - ret = afr_fd_ctx_set (this, local->fd); + GF_OPTION_INIT ("read-hash-mode", priv->hash_mode, uint32, out); - if (ret < 0) { - local->op_ret = -1; - local->op_errno = -ret; + priv->favorite_child = -1; + GF_OPTION_INIT ("favorite-child", fav_child, xlator, out); + if (fav_child) { + priv->favorite_child = xlator_subvolume_index (this, fav_child); + if (priv->favorite_child == -1) { + gf_log (this->name, GF_LOG_ERROR, "%s not a subvolume", + fav_child->name); + goto out; + } + gf_log (this->name, GF_LOG_WARNING, + favorite_child_warning_str, fav_child->name, + fav_child->name, fav_child->name); } - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->fd); - return 0; -} - - -int -afr_open_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - fd_t *fd) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int ret = 0; - - int call_count = -1; - - priv = this->private; - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - 
local->op_errno = op_errno; - } - - if (op_ret >= 0) { - local->op_ret = op_ret; - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if ((local->cont.open.flags & O_TRUNC) - && (local->op_ret >= 0)) { - STACK_WIND (frame, afr_open_ftruncate_cbk, - this, this->fops->ftruncate, - fd, 0); - } else { - ret = afr_fd_ctx_set (this, fd); - - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not set fd ctx for fd=%p", - fd); - - local->op_ret = -1; - local->op_errno = -ret; - } - - AFR_STACK_UNWIND (frame, local->op_ret, - local->op_errno, local->fd); - } - } - - return 0; -} - - -int -afr_open (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, fd_t *fd) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - - int i = 0; - int ret = -1; - - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - int32_t wind_flags = flags & (~O_TRUNC); - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); - - priv = this->private; - - if (afr_is_split_brain (this, loc->inode)) { - /* self-heal failed */ - - gf_log (this->name, GF_LOG_WARNING, - "returning EIO, file has to be manually corrected " - "in the backend"); - - op_errno = EIO; - goto out; - } - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - frame->local = local; - call_count = local->call_count; - - local->cont.open.flags = flags; - local->fd = fd_ref (fd); - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->open, - loc, wind_flags, fd); - - if (!--call_count) - break; - } - } - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno, fd); - } - - return 0; -} - -/* }}} */ - -/* {{{ flush */ - 
-int -afr_flush_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t * local = NULL; - int call_count = -1; + GF_OPTION_INIT ("background-self-heal-count", + priv->background_self_heal_count, uint32, out); - local = frame->local; + GF_OPTION_INIT ("data-self-heal", priv->data_self_heal, str, out); - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; + GF_OPTION_INIT ("data-self-heal-algorithm", + priv->data_self_heal_algorithm, str, out); - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + GF_OPTION_INIT ("data-self-heal-window-size", + priv->data_self_heal_window_size, uint32, out); - call_count = afr_frame_return (frame); + GF_OPTION_INIT ("metadata-self-heal", priv->metadata_self_heal, bool, + out); - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; -} - - -int -afr_flush_wind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int i = 0; - int call_count = -1; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_flush_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->flush, - local->fd); - - if (!--call_count) - break; - } - } - - return 0; -} - - -int -afr_flush_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; + GF_OPTION_INIT ("entry-self-heal", priv->entry_self_heal, bool, out); - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + GF_OPTION_INIT ("self-heal-daemon", priv->shd.enabled, bool, out); - return 0; -} + GF_OPTION_INIT ("iam-self-heal-daemon", priv->shd.iamshd, bool, out); + GF_OPTION_INIT 
("data-change-log", priv->data_change_log, bool, out); -int -afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; + GF_OPTION_INIT ("metadata-change-log", priv->metadata_change_log, bool, + out); - int ret = -1; + GF_OPTION_INIT ("entry-change-log", priv->entry_change_log, bool, out); - int op_ret = -1; - int op_errno = 0; + GF_OPTION_INIT ("optimistic-change-log", priv->optimistic_change_log, + bool, out); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + GF_OPTION_INIT ("inodelk-trace", priv->inodelk_trace, bool, out); - priv = this->private; + GF_OPTION_INIT ("entrylk-trace", priv->entrylk_trace, bool, out); - ALLOC_OR_GOTO (local, afr_local_t, out); + GF_OPTION_INIT ("strict-readdir", priv->strict_readdir, bool, out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + GF_OPTION_INIT ("eager-lock", priv->eager_lock, bool, out); + GF_OPTION_INIT ("quorum-type", qtype, str, out); + GF_OPTION_INIT ("quorum-count", priv->quorum_count, uint32, out); + GF_OPTION_INIT (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, size, + out); + fix_quorum_options(this,priv,qtype); - frame->local = local; + GF_OPTION_INIT ("post-op-delay-secs", priv->post_op_delay_secs, uint32, out); + GF_OPTION_INIT ("readdir-failover", priv->readdir_failover, bool, out); + GF_OPTION_INIT ("ensure-durability", priv->ensure_durability, bool, + out); - local->op = GF_FOP_FLUSH; - local->transaction.fop = afr_flush_wind; - local->transaction.done = afr_flush_done; + priv->wait_count = 1; - local->fd = fd_ref (fd); - - local->transaction.start = 0; - local->transaction.len = 0; - - afr_transaction (frame, this, AFR_FLUSH_TRANSACTION); - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); - } - - return 0; -} - -/* }}} */ - - -int -afr_release (xlator_t *this, fd_t *fd) -{ - uint64_t ctx; - 
afr_fd_ctx_t * fd_ctx; - - int ret = 0; - - ret = fd_ctx_get (fd, this, &ctx); - - if (ret < 0) + priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, + gf_afr_mt_char); + if (!priv->child_up) { + ret = -ENOMEM; goto out; - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - if (fd_ctx) { - if (fd_ctx->child_failed) - FREE (fd_ctx->child_failed); - - FREE (fd_ctx); } - -out: - return 0; -} - - -/* {{{ fsync */ - -int -afr_fsync_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - - int call_count = -1; - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); - - return 0; -} - - -int -afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int ret = -1; - - int i = 0; - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - call_count = local->call_count; - frame->local = local; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_fsync_cbk, - priv->children[i], - priv->children[i]->fops->fsync, - fd, datasync); - if (!--call_count) - break; - } - } - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno); - } - return 0; -} - -/* }}} */ - -/* {{{ fsync */ - -int32_t -afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - - int 
call_count = -1; - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); - - return 0; -} - - -int32_t -afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int ret = -1; - - int i = 0; - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - call_count = local->call_count; - frame->local = local; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_fsync_cbk, - priv->children[i], - priv->children[i]->fops->fsyncdir, - fd, datasync); - if (!--call_count) - break; - } - } - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno); - } - return 0; -} - -/* }}} */ - -/* {{{ xattrop */ - -int32_t -afr_xattrop_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) -{ - afr_local_t *local = NULL; - - int call_count = -1; - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, xattr); - - return 0; -} - - -int32_t -afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t optype, dict_t *xattr) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int ret = -1; - - int i = 0; - 
int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - call_count = local->call_count; - frame->local = local; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_xattrop_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - loc, optype, xattr); - if (!--call_count) - break; - } - } - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno); - } - return 0; -} - -/* }}} */ - -/* {{{ fxattrop */ - -int32_t -afr_fxattrop_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) -{ - afr_local_t *local = NULL; - - int call_count = -1; - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - call_count = afr_frame_return (frame); - - if (call_count == 0) - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, xattr); - - return 0; -} - - -int32_t -afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, - gf_xattrop_flags_t optype, dict_t *xattr) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int ret = -1; - - int i = 0; - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - call_count = local->call_count; - frame->local = local; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, 
afr_fxattrop_cbk, - priv->children[i], - priv->children[i]->fops->fxattrop, - fd, optype, xattr); - if (!--call_count) - break; - } - } - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno); - } - return 0; -} - -/* }}} */ - - -int32_t -afr_inodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) - -{ - afr_local_t *local = NULL; - - int call_count = -1; - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); - - return 0; -} - - -int32_t -afr_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct flock *flock) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int ret = -1; - - int i = 0; - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - call_count = local->call_count; - frame->local = local; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_inodelk_cbk, - priv->children[i], - priv->children[i]->fops->inodelk, - volume, loc, cmd, flock); - - if (!--call_count) - break; - } - } - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno); - } - return 0; -} - - -int32_t -afr_finodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) - -{ - afr_local_t *local = NULL; - - int call_count = -1; - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; - - 
local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); - - return 0; -} - - -int32_t -afr_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct flock *flock) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int ret = -1; - - int i = 0; - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - call_count = local->call_count; - frame->local = local; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_finodelk_cbk, - priv->children[i], - priv->children[i]->fops->finodelk, - volume, fd, cmd, flock); - - if (!--call_count) - break; - } - } - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno); - } - return 0; -} - - -int32_t -afr_entrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) - -{ - afr_local_t *local = NULL; - - int call_count = -1; - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); - - return 0; -} - - -int32_t -afr_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, - const char *basename, entrylk_cmd cmd, entrylk_type type) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int ret = -1; - - int i = 0; - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - 
VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - call_count = local->call_count; - frame->local = local; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_entrylk_cbk, - priv->children[i], - priv->children[i]->fops->entrylk, - volume, loc, basename, cmd, type); - - if (!--call_count) - break; - } - } - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno); - } - return 0; -} - - - -int32_t -afr_fentrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) - -{ - afr_local_t *local = NULL; - - int call_count = -1; - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); - - return 0; -} - - -int32_t -afr_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, - const char *basename, entrylk_cmd cmd, entrylk_type type) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int ret = -1; - - int i = 0; - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - call_count = local->call_count; - frame->local = local; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_fentrylk_cbk, - priv->children[i], - priv->children[i]->fops->fentrylk, - volume, 
fd, basename, cmd, type); - - if (!--call_count) - break; - } - } - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno); - } - return 0; -} - - -int32_t -afr_checksum_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - uint8_t *file_checksum, uint8_t *dir_checksum) - -{ - afr_local_t *local = NULL; - - int call_count = -1; - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == 0 && (local->op_ret != 0)) { - local->op_ret = 0; - - local->cont.checksum.file_checksum = MALLOC (ZR_FILENAME_MAX); - memcpy (local->cont.checksum.file_checksum, file_checksum, - ZR_FILENAME_MAX); - - local->cont.checksum.dir_checksum = MALLOC (ZR_FILENAME_MAX); - memcpy (local->cont.checksum.dir_checksum, dir_checksum, - ZR_FILENAME_MAX); - - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->cont.checksum.file_checksum, - local->cont.checksum.dir_checksum); - - return 0; -} - - -int32_t -afr_checksum (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t flag) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int ret = -1; - - int i = 0; - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - call_count = local->call_count; - frame->local = local; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_checksum_cbk, - priv->children[i], - priv->children[i]->fops->checksum, - loc, flag); - - if (!--call_count) - break; - } - } - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, 
op_errno); - } - return 0; -} - - -int32_t -afr_statfs_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct statvfs *statvfs) -{ - afr_local_t *local = NULL; - - int call_count = 0; - - LOCK (&frame->lock); - { - local = frame->local; - - if (op_ret == 0) { - local->op_ret = op_ret; - - if (local->cont.statfs.buf_set) { - if (statvfs->f_bavail < local->cont.statfs.buf.f_bavail) - local->cont.statfs.buf = *statvfs; - } else { - local->cont.statfs.buf = *statvfs; - local->cont.statfs.buf_set = 1; - } - } - - if (op_ret == -1) - local->op_errno = op_errno; - - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->cont.statfs.buf); - - return 0; -} - - -int32_t -afr_statfs (call_frame_t *frame, xlator_t *this, - loc_t *loc) -{ - afr_private_t * priv = NULL; - int child_count = 0; - afr_local_t * local = NULL; - int i = 0; - - int ret = -1; - int call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); - - priv = this->private; - child_count = priv->child_count; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - frame->local = local; - call_count = local->call_count; - - for (i = 0; i < child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_statfs_cbk, - priv->children[i], - priv->children[i]->fops->statfs, - loc); - if (!--call_count) - break; - } - } - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); - } - return 0; -} - - -int32_t -afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct flock *lock) -{ - afr_local_t * local = NULL; - - int call_count = -1; - - local = frame->local; - call_count = 
afr_frame_return (frame); - - if (call_count == 0) - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, - lock); - - return 0; -} - - -int32_t -afr_lk_unlock (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int i; - int call_count = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_locked_nodes_count (local->cont.lk.locked_nodes, - priv->child_count); - - if (call_count == 0) { - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->cont.lk.flock); - return 0; - } - - local->call_count = call_count; - - local->cont.lk.flock.l_type = F_UNLCK; - - for (i = 0; i < priv->child_count; i++) { - if (local->cont.lk.locked_nodes[i]) { - STACK_WIND (frame, afr_lk_unlock_cbk, - priv->children[i], - priv->children[i]->fops->lk, - local->fd, F_SETLK, - &local->cont.lk.flock); - - if (!--call_count) - break; - } - } - - return 0; -} - - -int32_t -afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct flock *lock) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int child_index = -1; - - local = frame->local; - priv = this->private; - - child_index = (long) cookie; - - call_count = --local->call_count; - - if (!child_went_down (op_ret, op_errno) && (op_ret == -1)) { - local->op_ret = -1; - local->op_errno = op_errno; - - afr_lk_unlock (frame, this); - return 0; - } - - if (op_ret == 0) { - local->op_ret = 0; - local->op_errno = 0; - local->cont.lk.flock = *lock; - local->cont.lk.locked_nodes[child_index] = 1; - } - - child_index++; - - if (child_index < priv->child_count) { - STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->lk, - local->fd, local->cont.lk.cmd, - &local->cont.lk.flock); - } else if (local->op_ret == -1) { - /* all nodes have gone down */ - - AFR_STACK_UNWIND (frame, -1, ENOTCONN, 
&local->cont.lk.flock); - } else { - /* locking has succeeded on all nodes that are up */ - - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->cont.lk.flock); - } - - return 0; -} - - -int -afr_lk (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t cmd, - struct flock *flock) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int i = 0; - - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - AFR_LOCAL_INIT (local, priv); - - frame->local = local; - - local->cont.lk.locked_nodes = CALLOC (priv->child_count, - sizeof (*local->cont.lk.locked_nodes)); - - if (!local->cont.lk.locked_nodes) { - gf_log (this->name, GF_LOG_ERROR, "out of memory :("); - op_errno = ENOMEM; - goto out; - } - - local->fd = fd_ref (fd); - local->cont.lk.cmd = cmd; - local->cont.lk.flock = *flock; - - STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0, - priv->children[i], - priv->children[i]->fops->lk, - fd, cmd, flock); - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); - } - return 0; -} - - -/** - * find_child_index - find the child's index in the array of subvolumes - * @this: AFR - * @child: child - */ - -static int -find_child_index (xlator_t *this, xlator_t *child) -{ - afr_private_t *priv = NULL; - - int i = -1; - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if ((xlator_t *) child == priv->children[i]) - break; - } - - return i; -} - - -int32_t -notify (xlator_t *this, int32_t event, - void *data, ...) 
-{ - afr_private_t * priv = NULL; - unsigned char * child_up = NULL; - - int i = -1; - int up_children = 0; - - priv = this->private; - - if (!priv) - return 0; - - child_up = priv->child_up; - - switch (event) { - case GF_EVENT_CHILD_UP: - i = find_child_index (this, data); - - gf_log (this->name, GF_LOG_NORMAL, - "subvolume %s came up", ((xlator_t *) data)->name); - - child_up[i] = 1; - - /* - if all the children were down, and one child came up, - send notify to parent - */ - - for (i = 0; i < priv->child_count; i++) - if (child_up[i]) - up_children++; - - if (up_children == 1) - default_notify (this, event, data); + for (i = 0; i < child_count; i++) + priv->child_up[i] = -1; /* start with unknown state. + this initialization needed + for afr_notify() to work + reliably + */ + + priv->children = GF_CALLOC (sizeof (xlator_t *), child_count, + gf_afr_mt_xlator_t); + if (!priv->children) { + ret = -ENOMEM; + goto out; + } - break; + priv->pending_key = GF_CALLOC (sizeof (*priv->pending_key), + child_count, + gf_afr_mt_char); + if (!priv->pending_key) { + ret = -ENOMEM; + goto out; + } - case GF_EVENT_CHILD_DOWN: - i = find_child_index (this, data); + trav = this->children; + i = 0; + while (i < child_count) { + priv->children[i] = trav->xlator; - gf_log (this->name, GF_LOG_NORMAL, - "subvolume %s went down", ((xlator_t *) data)->name); + ret = gf_asprintf (&priv->pending_key[i], "%s.%s", + AFR_XATTR_PREFIX, + trav->xlator->name); + if (-1 == ret) { + ret = -ENOMEM; + goto out; + } - child_up[i] = 0; - - /* - if all children are down, and this was the last to go down, - send notify to parent - */ + trav = trav->next; + i++; + } - for (i = 0; i < priv->child_count; i++) - if (child_up[i]) - up_children++; + ret = gf_asprintf (&priv->sh_domain, AFR_SH_DATA_DOMAIN_FMT, + this->name); + if (-1 == ret) { + ret = -ENOMEM; + goto out; + } - if (up_children == 0) - default_notify (this, event, data); + priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event), + 
gf_afr_mt_int32_t); + if (!priv->last_event) { + ret = -ENOMEM; + goto out; + } - break; + /* keep more local here as we may need them for self-heal etc */ + this->local_pool = mem_pool_new (afr_local_t, 512); + if (!this->local_pool) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "failed to create local_t's memory pool"); + goto out; + } - default: - default_notify (this, event, data); - } + priv->first_lookup = 1; + priv->root_inode = NULL; - return 0; -} + if (!priv->shd.iamshd) { + ret = 0; + goto out; + } + ret = -ENOMEM; + priv->shd.pos = GF_CALLOC (sizeof (*priv->shd.pos), child_count, + gf_afr_mt_brick_pos_t); + if (!priv->shd.pos) + goto out; -static const char *favorite_child_warning_str = "You have specified subvolume '%s' " - "as the 'favorite child'. This means that if a discrepancy in the content " - "or attributes (ownership, permission, etc.) of a file is detected among " - "the subvolumes, the file on '%s' will be considered the definitive " - "version and its contents will OVERWRITE the contents of the file on other " - "subvolumes. All versions of the file except that on '%s' " - "WILL BE LOST."; - -static const char *no_lock_servers_warning_str = "You have set lock-server-count = 0. " - "This means correctness is NO LONGER GUARANTEED in all cases. If two or more " - "applications write to the same region of a file, there is a possibility that " - "its copies will be INCONSISTENT. Set it to a value greater than 0 unless you " - "are ABSOLUTELY SURE of what you are doing and WILL NOT HOLD GlusterFS " - "RESPONSIBLE for inconsistent data. 
If you are in doubt, set it to a value " - "greater than 0."; - -int32_t -init (xlator_t *this) -{ - afr_private_t * priv = NULL; - int child_count = 0; - xlator_list_t * trav = NULL; - int i = 0; - int ret = -1; - int op_errno = 0; - - char * read_subvol = NULL; - char * fav_child = NULL; - char * self_heal = NULL; - char * change_log = NULL; - - int32_t lock_server_count = 1; - - int fav_ret = -1; - int read_ret = -1; - int dict_ret = -1; - - if (!this->children) { - gf_log (this->name, GF_LOG_ERROR, - "AFR needs more than one child defined"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); - } - - ALLOC_OR_GOTO (this->private, afr_private_t, out); - - priv = this->private; - - read_ret = dict_get_str (this->options, "read-subvolume", &read_subvol); - priv->read_child = -1; - - fav_ret = dict_get_str (this->options, "favorite-child", &fav_child); - priv->favorite_child = -1; - - /* Default values */ - - priv->data_self_heal = 1; - priv->metadata_self_heal = 1; - priv->entry_self_heal = 1; - - dict_ret = dict_get_str (this->options, "data-self-heal", &self_heal); - if (dict_ret == 0) { - ret = gf_string2boolean (self_heal, &priv->data_self_heal); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "invalid 'option data-self-heal %s' " - "defaulting to data-self-heal as 'on'", - self_heal); - priv->data_self_heal = 1; - } - } - - dict_ret = dict_get_str (this->options, "metadata-self-heal", - &self_heal); - if (dict_ret == 0) { - ret = gf_string2boolean (self_heal, &priv->metadata_self_heal); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "invalid 'option metadata-self-heal %s' " - "defaulting to metadata-self-heal as 'on'", - self_heal); - priv->metadata_self_heal = 1; - } - } - - dict_ret = dict_get_str (this->options, "entry-self-heal", &self_heal); - if (dict_ret == 0) { - ret = gf_string2boolean (self_heal, &priv->entry_self_heal); - if (ret < 0) { - gf_log (this->name, 
GF_LOG_WARNING, - "invalid 'option entry-self-heal %s' " - "defaulting to entry-self-heal as 'on'", - self_heal); - priv->entry_self_heal = 1; - } - } - - /* Change log options */ - - priv->data_change_log = 1; - priv->metadata_change_log = 0; - priv->entry_change_log = 1; - - dict_ret = dict_get_str (this->options, "data-change-log", - &change_log); - if (dict_ret == 0) { - ret = gf_string2boolean (change_log, &priv->data_change_log); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "invalid 'option data-change-log %s'. " - "defaulting to data-change-log as 'on'", - change_log); - priv->data_change_log = 1; - } - } - - dict_ret = dict_get_str (this->options, "metadata-change-log", - &change_log); - if (dict_ret == 0) { - ret = gf_string2boolean (change_log, - &priv->metadata_change_log); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "invalid 'option metadata-change-log %s'. " - "defaulting to metadata-change-log as 'off'", - change_log); - priv->metadata_change_log = 0; - } - } - - dict_ret = dict_get_str (this->options, "entry-change-log", - &change_log); - if (dict_ret == 0) { - ret = gf_string2boolean (change_log, &priv->entry_change_log); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "invalid 'option entry-change-log %s'. 
" - "defaulting to entry-change-log as 'on'", - change_log); - priv->entry_change_log = 1; - } - } - - /* Locking options */ - - priv->data_lock_server_count = 1; - priv->metadata_lock_server_count = 0; - priv->entry_lock_server_count = 1; - - dict_ret = dict_get_int32 (this->options, "data-lock-server-count", - &lock_server_count); - if (dict_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "setting data lock server count to %d", - lock_server_count); - - if (lock_server_count == 0) - gf_log (this->name, GF_LOG_WARNING, - no_lock_servers_warning_str); - - priv->data_lock_server_count = lock_server_count; - } - - - dict_ret = dict_get_int32 (this->options, - "metadata-lock-server-count", - &lock_server_count); - if (dict_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "setting metadata lock server count to %d", - lock_server_count); - priv->metadata_lock_server_count = lock_server_count; - } - - - dict_ret = dict_get_int32 (this->options, "entry-lock-server-count", - &lock_server_count); - if (dict_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "setting entry lock server count to %d", - lock_server_count); - - priv->entry_lock_server_count = lock_server_count; - } - - trav = this->children; - while (trav) { - if (!read_ret && !strcmp (read_subvol, trav->xlator->name)) { - gf_log (this->name, GF_LOG_DEBUG, - "subvolume '%s' specified as read child", - trav->xlator->name); - - priv->read_child = child_count; - } - - if (fav_ret == 0 && !strcmp (fav_child, trav->xlator->name)) { - gf_log (this->name, GF_LOG_WARNING, - favorite_child_warning_str, trav->xlator->name, - trav->xlator->name, trav->xlator->name); - priv->favorite_child = child_count; - } - - child_count++; - trav = trav->next; - } - - priv->wait_count = 1; - - priv->child_count = child_count; - - LOCK_INIT (&priv->lock); - LOCK_INIT (&priv->read_child_lock); + priv->shd.pending = GF_CALLOC (sizeof (*priv->shd.pending), child_count, + gf_afr_mt_int32_t); + if (!priv->shd.pending) + goto out; - 
priv->child_up = CALLOC (sizeof (unsigned char), child_count); - if (!priv->child_up) { - gf_log (this->name, GF_LOG_ERROR, - "out of memory :("); - op_errno = ENOMEM; - goto out; - } - - priv->children = CALLOC (sizeof (xlator_t *), child_count); - if (!priv->children) { - gf_log (this->name, GF_LOG_ERROR, - "out of memory :("); - op_errno = ENOMEM; - goto out; - } - - priv->pending_key = CALLOC (sizeof (*priv->pending_key), child_count); - if (!priv->pending_key) { - gf_log (this->name, GF_LOG_ERROR, - "out of memory :("); - op_errno = ENOMEM; + priv->shd.inprogress = GF_CALLOC (sizeof (*priv->shd.inprogress), + child_count, gf_afr_mt_shd_bool_t); + if (!priv->shd.inprogress) + goto out; + priv->shd.timer = GF_CALLOC (sizeof (*priv->shd.timer), child_count, + gf_afr_mt_shd_timer_t); + if (!priv->shd.timer) goto out; - } - trav = this->children; - i = 0; - while (i < child_count) { - priv->children[i] = trav->xlator; + priv->shd.healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false, + _destroy_shd_event_data); + if (!priv->shd.healed) + goto out; - asprintf (&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX, - trav->xlator->name); + priv->shd.heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false, + _destroy_shd_event_data); + if (!priv->shd.heal_failed) + goto out; - trav = trav->next; - i++; - } + priv->shd.split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false, + _destroy_shd_event_data); + if (!priv->shd.split_brain) + goto out; - ret = 0; + this->itable = inode_table_new (SHD_INODE_LRU_LIMIT, this); + if (!this->itable) + goto out; + priv->root_inode = inode_ref (this->itable->root); + GF_OPTION_INIT ("node-uuid", priv->shd.node_uuid, str, out); + GF_OPTION_INIT ("heal-timeout", priv->shd.timeout, int32, out); + ret = afr_initialise_statistics (this); + if (ret) + goto out; + ret = 0; out: - return ret; + return ret; } int fini (xlator_t *this) { - return 0; + afr_private_t *priv = NULL; + + priv = this->private; + this->private = NULL; + afr_priv_destroy 
(priv); + if (this->itable);//I dont see any destroy func + + return 0; } struct xlator_fops fops = { - .lookup = afr_lookup, - .open = afr_open, - .lk = afr_lk, - .flush = afr_flush, - .statfs = afr_statfs, - .fsync = afr_fsync, - .fsyncdir = afr_fsyncdir, - .xattrop = afr_xattrop, - .fxattrop = afr_fxattrop, - .inodelk = afr_inodelk, - .finodelk = afr_finodelk, - .entrylk = afr_entrylk, - .fentrylk = afr_fentrylk, - .checksum = afr_checksum, - - /* inode read */ - .access = afr_access, - .stat = afr_stat, - .fstat = afr_fstat, - .readlink = afr_readlink, - .getxattr = afr_getxattr, - .readv = afr_readv, - - /* inode write */ - .chmod = afr_chmod, - .chown = afr_chown, - .fchmod = afr_fchmod, - .fchown = afr_fchown, - .writev = afr_writev, - .truncate = afr_truncate, - .ftruncate = afr_ftruncate, - .utimens = afr_utimens, - .setxattr = afr_setxattr, - .removexattr = afr_removexattr, - - /* dir read */ - .opendir = afr_opendir, - .readdir = afr_readdir, - .getdents = afr_getdents, - - /* dir write */ - .create = afr_create, - .mknod = afr_mknod, - .mkdir = afr_mkdir, - .unlink = afr_unlink, - .rmdir = afr_rmdir, - .link = afr_link, - .symlink = afr_symlink, - .rename = afr_rename, - .setdents = afr_setdents, + .lookup = afr_lookup, + .open = afr_open, + .lk = afr_lk, + .flush = afr_flush, + .statfs = afr_statfs, + .fsync = afr_fsync, + .fsyncdir = afr_fsyncdir, + .xattrop = afr_xattrop, + .fxattrop = afr_fxattrop, + .inodelk = afr_inodelk, + .finodelk = afr_finodelk, + .entrylk = afr_entrylk, + .fentrylk = afr_fentrylk, + .fallocate = afr_fallocate, + .discard = afr_discard, + .zerofill = afr_zerofill, + + /* inode read */ + .access = afr_access, + .stat = afr_stat, + .fstat = afr_fstat, + .readlink = afr_readlink, + .getxattr = afr_getxattr, + .fgetxattr = afr_fgetxattr, + .readv = afr_readv, + + /* inode write */ + .writev = afr_writev, + .truncate = afr_truncate, + .ftruncate = afr_ftruncate, + .setxattr = afr_setxattr, + .fsetxattr = afr_fsetxattr, + .setattr = 
afr_setattr,
+        .fsetattr = afr_fsetattr,
+        .removexattr = afr_removexattr,
+        .fremovexattr = afr_fremovexattr,
+
+        /* dir read */
+        .opendir = afr_opendir,
+        .readdir = afr_readdir,
+        .readdirp = afr_readdirp,
+
+        /* dir write */
+        .create = afr_create,
+        .mknod = afr_mknod,
+        .mkdir = afr_mkdir,
+        .unlink = afr_unlink,
+        .rmdir = afr_rmdir,
+        .link = afr_link,
+        .symlink = afr_symlink,
+        .rename = afr_rename,
 };
-struct xlator_mops mops = {
+struct xlator_dumpops dumpops = {
+        .priv = afr_priv_dump,
 };
 struct xlator_cbks cbks = {
         .release = afr_release,
+        .releasedir = afr_releasedir,
+        .forget = afr_forget,
 };
 struct volume_options options[] = {
-        { .key = {"read-subvolume" },
-          .type = GF_OPTION_TYPE_XLATOR
-        },
-        { .key = {"favorite-child"},
-          .type = GF_OPTION_TYPE_XLATOR
-        },
-        { .key = {"data-self-heal"},
-          .type = GF_OPTION_TYPE_BOOL
-        },
-        { .key = {"metadata-self-heal"},
-          .type = GF_OPTION_TYPE_BOOL
-        },
-        { .key = {"entry-self-heal"},
-          .type = GF_OPTION_TYPE_BOOL
-        },
-        { .key = {"data-change-log"},
-          .type = GF_OPTION_TYPE_BOOL
-        },
-        { .key = {"metadata-change-log"},
-          .type = GF_OPTION_TYPE_BOOL
-        },
-        { .key = {"entry-change-log"},
-          .type = GF_OPTION_TYPE_BOOL
-        },
-        { .key = {"data-lock-server-count"},
-          .type = GF_OPTION_TYPE_INT,
-          .min = 0
-        },
-        { .key = {"metadata-lock-server-count"},
-          .type = GF_OPTION_TYPE_INT,
-          .min = 0
-        },
-        { .key = {"entry-lock-server-count"},
-          .type = GF_OPTION_TYPE_INT,
-          .min = 0
-        },
-        { .key = {NULL} },
+        { .key = {"read-subvolume" },
+          .type = GF_OPTION_TYPE_XLATOR,
+          .description = "inode-read fops happen only on one of the bricks in "
+                         "replicate. Afr will prefer the one specified using "
+                         "this option if it is not stale. Option value must be "
+                         "one of the xlator names of the children. "
+                         "Ex: <volname>-client-0 till "
+                         "<volname>-client-<number-of-bricks - 1>"
+        },
+        { .key = {"read-subvolume-index" },
+          .type = GF_OPTION_TYPE_INT,
+          .default_value = "-1",
+          .description = "inode-read fops happen only on one of the bricks in "
+                         "replicate. AFR will prefer the one specified using "
+                         "this option if it is not stale. Allowed options"
+                         " include -1 till replica-count - 1"
+        },
+        { .key = {"read-hash-mode" },
+          .type = GF_OPTION_TYPE_INT,
+          .min = 0,
+          .max = 2,
+          .default_value = "0",
+          .description = "inode-read fops happen only on one of the bricks in "
+                         "replicate. AFR will prefer the one computed using "
+                         "the method specified using this option. "
+                         "0 = first responder, "
+                         "1 = hash by GFID of file (all clients use "
+                         "same subvolume), "
+                         "2 = hash by GFID of file and client PID",
+        },
+        { .key = {"choose-local" },
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "true",
+          .description = "Choose a local subvolume(i.e. Brick) to read from if "
+                         "read-subvolume is not explicitly set.",
+        },
+        { .key = {"favorite-child"},
+          .type = GF_OPTION_TYPE_XLATOR,
+          .description = "If a split-brain happens choose subvol/brick set by "
+                         "this option as source."
+        },
+        { .key = {"background-self-heal-count"},
+          .type = GF_OPTION_TYPE_INT,
+          .min = 0,
+          .default_value = "16",
+          .validate = GF_OPT_VALIDATE_MIN,
+          .description = "This specifies the number of self-heals that can be "
+                         "performed in background without blocking the fop"
+        },
+        { .key = {"data-self-heal"},
+          .type = GF_OPTION_TYPE_STR,
+          .value = {"1", "on", "yes", "true", "enable",
+                    "0", "off", "no", "false", "disable",
+                    "open"},
+          .default_value = "on",
+          .description = "Using this option we can enable/disable data "
+                         "self-heal on the file. \"open\" means data "
+                         "self-heal action will only be triggered by file "
+                         "open operations."
+        },
+        { .key = {"data-self-heal-algorithm"},
+          .type = GF_OPTION_TYPE_STR,
+          .description = "Select between \"full\", \"diff\". The "
+                         "\"full\" algorithm copies the entire file from "
+                         "source to sink. The \"diff\" algorithm copies to "
+                         "sink only those blocks whose checksums don't match "
+                         "with those of source. If no option is configured "
+                         "the option is chosen dynamically as follows: "
+                         "If the file does not exist on one of the sinks "
+                         "or empty file exists or if the source file size is "
+                         "about the same as page size the entire file will "
+                         "be read and written i.e \"full\" algo, "
+                         "otherwise \"diff\" algo is chosen.",
+          .value = { "diff", "full"}
+        },
+        { .key = {"data-self-heal-window-size"},
+          .type = GF_OPTION_TYPE_INT,
+          .min = 1,
+          .max = 1024,
+          .default_value = "1",
+          .description = "Maximum number of blocks per file for which self-heal "
+                         "process would be applied simultaneously."
+        },
+        { .key = {"metadata-self-heal"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "on",
+          .description = "Using this option we can enable/disable metadata "
+                         "i.e. Permissions, ownerships, xattrs self-heal on "
+                         "the file/directory."
+        },
+        { .key = {"entry-self-heal"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "on",
+          .description = "Using this option we can enable/disable entry "
+                         "self-heal on the directory."
+        },
+        { .key = {"data-change-log"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "on",
+          .description = "Data fops like write/truncate will not perform "
+                         "pre/post fop changelog operations in afr transaction "
+                         "if this option is disabled"
+        },
+        { .key = {"metadata-change-log"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "on",
+          .description = "Metadata fops like setattr/setxattr will not perform "
+                         "pre/post fop changelog operations in afr transaction "
+                         "if this option is disabled"
+        },
+        { .key = {"entry-change-log"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "on",
+          .description = "Entry fops like create/unlink will not perform "
+                         "pre/post fop changelog operations in afr transaction "
+                         "if this option is disabled"
+        },
+        { .key = {"optimistic-change-log"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "on",
+          .description = "Entry/Metadata fops will not perform "
+                         "pre fop changelog operations in afr transaction "
+                         "if this option is enabled."
+        },
+        { .key = {"strict-readdir"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "off",
+        },
+        { .key = {"inodelk-trace"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "off",
+          .description = "Enabling this option logs inode lock/unlocks"
+        },
+        { .key = {"entrylk-trace"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "off",
+          .description = "Enabling this option logs entry lock/unlocks"
+        },
+        { .key = {"eager-lock"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "on",
+          .description = "Lock phase of a transaction has two sub-phases. "
+                         "First is an attempt to acquire locks in parallel by "
+                         "broadcasting non-blocking lock requests. If lock "
+                         "acquisition fails on any server, then the held locks "
+                         "are unlocked and revert to a blocking locked mode "
+                         "sequentially on one server after another. If this "
+                         "option is enabled the initial broadcasting lock "
+                         "request attempt to acquire lock on the entire file. "
+                         "If this fails, we revert back to the sequential "
+                         "\"regional\" blocking lock as before. In the case "
+                         "where such an \"eager\" lock is granted in the "
+                         "non-blocking phase, it gives rise to an opportunity "
+                         "for optimization. i.e, if the next write transaction "
+                         "on the same FD arrives before the unlock phase of "
+                         "the first transaction, it \"takes over\" the full "
+                         "file lock. Similarly if yet another data transaction "
+                         "arrives before the unlock phase of the \"optimized\" "
+                         "transaction, that in turn \"takes over\" the lock as "
+                         "well. The actual unlock now happens at the end of "
+                         "the last \"optimized\" transaction."
+
+        },
+        { .key = {"self-heal-daemon"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "off",
+          .description = "This option applies to only self-heal-daemon. "
+                         "Index directory crawl and automatic healing of files "
+                         "will not be performed if this option is turned off."
+        },
+        { .key = {"iam-self-heal-daemon"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "off",
+          .description = "This option differentiates if the replicate "
+                         "translator is running as part of self-heal-daemon "
+                         "or not."
+        },
+        { .key = {"quorum-type"},
+          .type = GF_OPTION_TYPE_STR,
+          .value = { "none", "auto", "fixed"},
+          .default_value = "none",
+          .description = "If value is \"fixed\" only allow writes if "
+                         "quorum-count bricks are present. If value is "
+                         "\"auto\" only allow writes if more than half of "
+                         "bricks, or exactly half including the first, are "
+                         "present.",
+        },
+        { .key = {"quorum-count"},
+          .type = GF_OPTION_TYPE_INT,
+          .min = 1,
+          .max = INT_MAX,
+          .default_value = 0, /* NOTE(review): bare 0 while every other default in this table is a string literal -- confirm this is intended */
+          .description = "If quorum-type is \"fixed\" only allow writes if "
+                         "this many bricks are present. Other quorum types "
+                         "will OVERWRITE this value.",
+        },
+        { .key = {"node-uuid"},
+          .type = GF_OPTION_TYPE_STR,
+          .description = "Local glusterd uuid string, used in starting "
+                         "self-heal-daemon so that it can crawl only on "
+                         "local index directories.",
+        },
+        { .key = {"heal-timeout"},
+          .type = GF_OPTION_TYPE_INT,
+          .min = 60,
+          .max = INT_MAX,
+          .default_value = "600",
+          .description = "time interval for checking the need to self-heal "
+                         "in self-heal-daemon"
+        },
+        { .key = {"post-op-delay-secs"},
+          .type = GF_OPTION_TYPE_INT,
+          .min = 0,
+          .max = INT_MAX,
+          .default_value = "1",
+          .description = "Time interval induced artificially before "
+                         "post-operation phase of the transaction to "
+                         "enhance overlap of adjacent write operations.",
+        },
+        { .key = {AFR_SH_READDIR_SIZE_KEY},
+          .type = GF_OPTION_TYPE_SIZET,
+          .description = "readdirp size for performing entry self-heal",
+          .min = 1024,
+          .max = 131072,
+          .default_value = "1KB",
+        },
+        { .key = {"readdir-failover"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .description = "readdir(p) will not failover if this option is off",
+          .default_value = "on",
+        },
+        { .key = {"ensure-durability"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .description = "Afr performs fsyncs for transactions if this "
+                         "option is on to make sure the changelogs/data is "
+                         "written to the disk",
+          .default_value = "on",
+        },
+        { .key = {NULL} },
 };
