diff options
Diffstat (limited to 'xlators/cluster/afr/src/afr-self-heal-metadata.c')
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-metadata.c | 1255 |
1 files changed, 504 insertions, 751 deletions
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index de8fa1a3914..03f43bad16e 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -1,793 +1,546 @@ /* - Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -#include <libgen.h> -#include <unistd.h> -#include <fnmatch.h> -#include <sys/time.h> -#include <stdlib.h> -#include <signal.h> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ -#include "glusterfs.h" #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" - -#include "afr-transaction.h" #include "afr-self-heal.h" -#include "afr-self-heal-common.h" - - -int -afr_sh_metadata_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - -// memset (sh->child_errno, 0, sizeof (int) * priv->child_count); - memset (sh->buf, 0, sizeof (struct stat) * priv->child_count); - memset (sh->success, 0, sizeof (int) * priv->child_count); - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) - dict_unref (sh->xattr[i]); - sh->xattr[i] = NULL; - } - - if (local->govinda_gOvinda) { - gf_log (this->name, GF_LOG_WARNING, - "aborting selfheal of %s", - local->loc.path); - sh->completion_cbk (frame, this); - } else { - if (S_ISREG (local->cont.lookup.buf.st_mode)) { - gf_log (this->name, GF_LOG_DEBUG, - "proceeding to data check on %s", - local->loc.path); - afr_self_heal_data (frame, this); - return 0; - } - - if (S_ISDIR (local->cont.lookup.buf.st_mode)) { - gf_log (this->name, GF_LOG_DEBUG, - "proceeding to entry check on %s", - local->loc.path); - afr_self_heal_entry (frame, this); - return 0; - } - gf_log (this->name, GF_LOG_DEBUG, - "completed self heal of %s", - local->loc.path); - - sh->completion_cbk (frame, this); - } - - return 0; -} - - -int -afr_sh_metadata_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - int call_count = 0; - - - local = frame->local; - - LOCK (&frame->lock); - { - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_metadata_done (frame, this); - - return 0; -} - - -int -afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - int call_count = 0; - struct flock flock = {0, }; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - call_count = local->child_count; - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - flock.l_start = 0; - flock.l_len = 0; - flock.l_type = F_UNLCK; - - if (local->child_up[i]) { - gf_log (this->name, GF_LOG_DEBUG, - "unlocking %s on subvolume %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND (frame, afr_sh_metadata_unlck_cbk, - priv->children[i], - priv->children[i]->fops->inodelk, - this->name, - &local->loc, F_SETLK, &flock); - - if (!--call_count) - break; - } - } - - return 0; -} - - -int -afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - LOCK (&frame->lock); - { - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_metadata_finish (frame, this); - - return 0; -} - - -int -afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; - dict_t **erase_xattr = NULL; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - - afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix, - sh->success, priv->child_count); - - erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count); - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - call_count++; - - erase_xattr[i] = get_new_dict(); - dict_ref (erase_xattr[i]); - } - } - - afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr, - priv->child_count, AFR_METADATA_PENDING); - - local->call_count = call_count; - - if (call_count == 0) { - gf_log (this->name, GF_LOG_WARNING, - "metadata of %s not healed on any subvolume", - local->loc.path); - - afr_sh_metadata_finish (frame, this); - } - - for (i = 0; i < priv->child_count; i++) { - if (!erase_xattr[i]) - continue; - - gf_log (this->name, GF_LOG_DEBUG, - "erasing pending flags from %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_metadata_erase_pending_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, erase_xattr[i]); - if (!--call_count) - break; - } - - for (i = 0; i < priv->child_count; i++) { - if (erase_xattr[i]) { - dict_unref (erase_xattr[i]); - } - } - FREE (erase_xattr); - - return 0; -} - - -int -afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = 0; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "setting attributes failed for %s on %s (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - - sh->success[child_index] = 0; - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); +#include <glusterfs/byte-order.h> +#include "protocol-common.h" +#include <glusterfs/events.h> - if (call_count == 0) - afr_sh_metadata_erase_pending (frame, this); +#define AFR_HEAL_ATTR (GF_SET_ATTR_UID | GF_SET_ATTR_GID | GF_SET_ATTR_MODE) - return 0; -} - - -int -afr_sh_metadata_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) +static gf_boolean_t +_afr_ignorable_key_match(dict_t *d, char *k, data_t *val, void *mdata) { - afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno); - - return 0; + return afr_is_xattr_ignorable(k); } - -int -afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +void +afr_delete_ignorable_xattrs(dict_t *xattr) { - afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno); - - return 0; + dict_foreach_match(xattr, _afr_ignorable_key_match, NULL, + dict_remove_foreach_fn, NULL); } - int -afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr) +__afr_selfheal_metadata_do(call_frame_t *frame, xlator_t *this, inode_t *inode, + int source, unsigned char *healed_sinks, + struct afr_reply *locked_replies) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; - int active_sinks = 0; - int call_count = 0; - int i = 0; - struct timespec ts[2]; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - active_sinks = sh->active_sinks; - - /* - * 4 calls per sink - chown, chmod, utimes, setxattr - */ - if (xattr) - call_count = active_sinks * 4; - else - call_count = active_sinks * 3; - - local->call_count = call_count; - -#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC - ts[0] = sh->buf[source].st_atim; - ts[1] = sh->buf[source].st_mtim; -#elif HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC - ts[0] = sh->buf[source].st_atimespec; - ts[1] = sh->buf[source].st_mtimespec; -#else - ts[0].tv_sec = sh->buf[source].st_atime; - ts[1].tv_sec = sh->buf[source].st_mtime; -#endif - - for (i = 0; i < priv->child_count; i++) { - if (call_count == 0) { - break; - } - if (sh->sources[i] || !local->child_up[i]) - continue; - - gf_log (this->name, GF_LOG_DEBUG, - "syncing metadata of %s from %s to %s", - local->loc.path, priv->children[source]->name, - priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->chown, - &local->loc, - sh->buf[source].st_uid, - sh->buf[source].st_gid); - - STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->chmod, - &local->loc, sh->buf[source].st_mode); - - STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->utimens, - &local->loc, ts); - - call_count = call_count - 3; - - if (!xattr) - continue; - - STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setxattr, - &local->loc, xattr, 0); - call_count--; - } - - return 0; + int ret = -1; + loc_t loc = { + 0, + }; + dict_t *xattr = NULL; + dict_t *old_xattr = NULL; + afr_private_t *priv = NULL; + int i = 0; + + priv = this->private; + + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO, + "performing metadata selfheal on %s", uuid_utoa(inode->gfid)); + + ret = syncop_getxattr(priv->children[source], &loc, &xattr, NULL, NULL, + NULL); + if (ret < 0) { + ret = -EIO; + goto out; + } + + afr_delete_ignorable_xattrs(xattr); + + for (i = 0; i < priv->child_count; i++) { + if (old_xattr) { + dict_unref(old_xattr); + old_xattr = NULL; + } + + if (!healed_sinks[i]) + continue; + + ret = syncop_setattr(priv->children[i], &loc, + &locked_replies[source].poststat, AFR_HEAL_ATTR, + NULL, NULL, NULL, NULL); + if (ret) + healed_sinks[i] = 0; + + ret = syncop_getxattr(priv->children[i], &loc, &old_xattr, 0, NULL, + NULL); + if (old_xattr) { + afr_delete_ignorable_xattrs(old_xattr); + ret = syncop_removexattr(priv->children[i], &loc, "", old_xattr, + NULL); + if (ret) + healed_sinks[i] = 0; + } + + ret = syncop_setxattr(priv->children[i], &loc, xattr, 0, NULL, NULL); + if (ret) + healed_sinks[i] = 0; + } + ret = 0; + +out: + loc_wipe(&loc); + if (xattr) + dict_unref(xattr); + if (old_xattr) + dict_unref(old_xattr); + + return ret; } - -int -afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) +static uint64_t +mtime_ns(struct iatt *ia) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "getxattr of %s failed on subvolume %s (%s). proceeding without xattr", - local->loc.path, priv->children[source]->name, - strerror (op_errno)); - - afr_sh_metadata_sync (frame, this, NULL); - } else { - dict_del (xattr, AFR_DATA_PENDING); - dict_del (xattr, AFR_METADATA_PENDING); - dict_del (xattr, AFR_ENTRY_PENDING); - afr_sh_metadata_sync (frame, this, xattr); - } - - return 0; -} + uint64_t ret; + ret = (((uint64_t)(ia->ia_mtime)) * 1000000000) + + (uint64_t)(ia->ia_mtime_nsec); -int -afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int active_sinks = 0; - int source = 0; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] == 0 && local->child_up[i] == 1) { - active_sinks++; - sh->success[i] = 1; - } - } - sh->success[source] = 1; - - if (active_sinks == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "no active sinks for performing self-heal on file %s", - local->loc.path); - afr_sh_metadata_finish (frame, this); - return 0; - } - sh->active_sinks = active_sinks; - - gf_log (this->name, GF_LOG_DEBUG, - "syncing metadata of %s from subvolume %s to %d active sinks", - local->loc.path, priv->children[source]->name, active_sinks); - - STACK_WIND (frame, afr_sh_metadata_getxattr_cbk, - priv->children[source], - priv->children[source]->fops->getxattr, - &local->loc, NULL); - - return 0; + return ret; } - -int -afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this) +/* + * When directory content is modified, [mc]time is updated. On + * Linux, the filesystem does it, while at least on NetBSD, the + * kernel file-system independent code does it. This means that + * when entries are added while bricks are down, the kernel sends + * a SETATTR [mc]time which will cause metadata split brain for + * the directory. In this case, clear the split brain by finding + * the source with the most recent modification date. + */ +static int +afr_dirtime_splitbrain_source(call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, + unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int nsources = 0; - int source = 0; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr, - priv->child_count, AFR_METADATA_PENDING); - - afr_sh_print_pending_matrix (sh->pending_matrix, this); - - afr_sh_mark_sources (sh->pending_matrix, sh->sources, - priv->child_count); - - afr_sh_supress_errenous_children (sh->sources, sh->child_errno, - priv->child_count); - - nsources = afr_sh_source_count (sh->sources, priv->child_count); - - if ((nsources == 0) - && (priv->favorite_child != -1) - && (sh->child_errno[priv->favorite_child] == 0)) { - - gf_log (this->name, GF_LOG_WARNING, - "Picking favorite child %s as authentic source to resolve conflicting metadata of %s", - priv->children[priv->favorite_child]->name, - local->loc.path); - - sh->sources[priv->favorite_child] = 1; - - nsources = afr_sh_source_count (sh->sources, - priv->child_count); - } - - if (nsources == 0) { - gf_log (this->name, GF_LOG_ERROR, - "Unable to resolve conflicting metadata of %s. " - "Please resolve manually by fixing the " - "permissions/ownership of %s on your subvolumes. " - "You can also consider 'option favorite-child <>'", - local->loc.path, local->loc.path); - - local->govinda_gOvinda = 1; - - afr_sh_metadata_finish (frame, this); - return 0; - } - - source = afr_sh_select_source (sh->sources, priv->child_count); - sh->source = source; - - /* detect changes not visible through pending flags -- JIC */ - for (i = 0; i < priv->child_count; i++) { - if (i == source || sh->child_errno[i]) - continue; - - if (PERMISSION_DIFFERS (&sh->buf[i], &sh->buf[source])) - sh->sources[i] = 0; - - if (OWNERSHIP_DIFFERS (&sh->buf[i], &sh->buf[source])) - sh->sources[i] = 0; - } - - afr_sh_metadata_sync_prepare (frame, this); - - return 0; + afr_private_t *priv = NULL; + int source = -1; + struct iatt source_ia; + struct iatt child_ia; + uint64_t mtime = 0; + int i; + int ret = -1; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!locked_on[i]) + continue; + + if (!replies[i].valid) + continue; + + if (replies[i].op_ret != 0) + continue; + + if (mtime_ns(&replies[i].poststat) <= mtime) + continue; + + mtime = mtime_ns(&replies[i].poststat); + source = i; + } + + if (source == -1) + goto out; + + source_ia = replies[source].poststat; + if (source_ia.ia_type != IA_IFDIR) + goto out; + + for (i = 0; i < priv->child_count; i++) { + if (i == source) + continue; + + if (!replies[i].valid) + continue; + + if (replies[i].op_ret != 0) + continue; + + child_ia = replies[i].poststat; + + if (!IA_EQUAL(source_ia, child_ia, gfid) || + !IA_EQUAL(source_ia, child_ia, type) || + !IA_EQUAL(source_ia, child_ia, prot) || + !IA_EQUAL(source_ia, child_ia, uid) || + !IA_EQUAL(source_ia, child_ia, gid) || + !afr_xattrs_are_equal(replies[source].xdata, replies[i].xdata)) + goto out; + } + + /* + * Metadata split brain is just about [amc]time + * We return our source. + */ + ret = source; +out: + return ret; } - -int -afr_sh_metadata_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct stat *buf, dict_t *xattr) +static int +__afr_selfheal_metadata_mark_pending_xattrs(call_frame_t *frame, xlator_t *this, + inode_t *inode, + struct afr_reply *replies, + unsigned char *sources) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = 0; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "path %s on subvolume %s is of mode 0%o", - local->loc.path, - priv->children[child_index]->name, - buf->st_mode); - - sh->buf[child_index] = *buf; - if (xattr) - sh->xattr[child_index] = dict_ref (xattr); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "path %s on subvolume %s => -1 (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - - sh->child_errno[child_index] = op_errno; - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_metadata_fix (frame, this); - - return 0; + int ret = 0; + int i = 0; + int m_idx = 0; + afr_private_t *priv = NULL; + int raw[AFR_NUM_CHANGE_LOGS] = {0}; + dict_t *xattr = NULL; + + priv = this->private; + m_idx = afr_index_for_transaction_type(AFR_METADATA_TRANSACTION); + raw[m_idx] = 1; + + xattr = dict_new(); + if (!xattr) + return -ENOMEM; + + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) + continue; + ret = dict_set_static_bin(xattr, priv->pending_key[i], raw, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + ret = -1; + goto out; + } + } + + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + ret = afr_selfheal_post_op(frame, this, inode, i, xattr, NULL); + if (ret < 0) { + gf_msg(this->name, GF_LOG_INFO, -ret, AFR_MSG_SELF_HEAL_INFO, + "Failed to set pending metadata xattr on child %d for %s", i, + uuid_utoa(inode->gfid)); + goto out; + } + } + + afr_replies_wipe(replies, priv->child_count); + ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies); + +out: + if (xattr) + dict_unref(xattr); + return ret; } - -int -afr_sh_metadata_lookup (call_frame_t *frame, xlator_t *this) +/* + * Look for mismatching uid/gid or mode or user xattrs even if + * AFR xattrs don't say so, and pick one arbitrarily as winner. */ + +static int +__afr_selfheal_metadata_finalize_source(call_frame_t *frame, xlator_t *this, + inode_t *inode, unsigned char *sources, + unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *undid_pending, + unsigned char *locked_on, + struct afr_reply *replies) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - int call_count = 0; - dict_t *xattr_req = NULL; - int ret = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - call_count = local->child_count; - local->call_count = call_count; - - xattr_req = dict_new(); - - if (xattr_req) - ret = dict_set_uint64 (xattr_req, AFR_METADATA_PENDING, - priv->child_count * sizeof(int32_t)); - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - gf_log (this->name, GF_LOG_DEBUG, - "looking up %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_metadata_lookup_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - &local->loc, xattr_req); - if (!--call_count) - break; - } - } - - if (xattr_req) - dict_unref (xattr_req); - - return 0; + int i = 0; + afr_private_t *priv = NULL; + struct iatt srcstat = { + 0, + }; + int source = -1; + int sources_count = 0; + int ret = 0; + + priv = this->private; + + sources_count = AFR_COUNT(sources, priv->child_count); + + if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) || + !sources_count) { + source = afr_mark_split_brain_source_sinks( + frame, this, inode, sources, sinks, healed_sinks, locked_on, + replies, AFR_METADATA_TRANSACTION); + if (source >= 0) { + _afr_fav_child_reset_sink_xattrs( + frame, this, inode, source, healed_sinks, undid_pending, + AFR_METADATA_TRANSACTION, locked_on, replies); + goto out; + } + + /* If this is a directory mtime/ctime only split brain + use the most recent */ + source = afr_dirtime_splitbrain_source(frame, this, replies, locked_on); + if (source != -1) { + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SPLIT_BRAIN, + "clear time " + "split brain on %s", + uuid_utoa(replies[source].poststat.ia_gfid)); + sources[source] = 1; + healed_sinks[source] = 0; + goto out; + } + + if (!priv->metadata_splitbrain_forced_heal) { + gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" + "subvol=%s;" + "type=metadata;file=%s", + this->ctx->cmd_args.client_pid, this->name, + uuid_utoa(inode->gfid)); + return -EIO; + } + + /* Metadata split brain, select one subvol + arbitrarily */ + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i] && healed_sinks[i]) { + sources[i] = 1; + healed_sinks[i] = 0; + break; + } + } + } + + /* No split brain at this point. If we were called from + * afr_heal_splitbrain_file(), abort.*/ + if (afr_dict_contains_heal_op(frame)) + return -EIO; + + source = afr_choose_source_by_policy(priv, sources, + AFR_METADATA_TRANSACTION); + srcstat = replies[source].poststat; + + for (i = 0; i < priv->child_count; i++) { + if (!sources[i] || i == source) + continue; + if (!IA_EQUAL(srcstat, replies[i].poststat, type) || + !IA_EQUAL(srcstat, replies[i].poststat, uid) || + !IA_EQUAL(srcstat, replies[i].poststat, gid) || + !IA_EQUAL(srcstat, replies[i].poststat, prot)) { + gf_msg_debug(this->name, 0, + "%s: iatt mismatch " + "for source(%d) vs (%d)", + uuid_utoa(replies[source].poststat.ia_gfid), source, + i); + sources[i] = 0; + healed_sinks[i] = 1; + } + } + + for (i = 0; i < priv->child_count; i++) { + if (!sources[i] || i == source) + continue; + if (!afr_xattrs_are_equal(replies[source].xdata, replies[i].xdata)) { + gf_msg_debug(this->name, 0, + "%s: xattr mismatch " + "for source(%d) vs (%d)", + uuid_utoa(replies[source].poststat.ia_gfid), source, + i); + sources[i] = 0; + healed_sinks[i] = 1; + } + } + if ((sources_count == priv->child_count) && (source > -1) && + (AFR_COUNT(healed_sinks, priv->child_count) != 0)) { + ret = __afr_selfheal_metadata_mark_pending_xattrs(frame, this, inode, + replies, sources); + if (ret < 0) + return ret; + } +out: + afr_mark_active_sinks(this, sources, locked_on, healed_sinks); + return source; } - int -afr_sh_metadata_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +__afr_selfheal_metadata_prepare(call_frame_t *frame, xlator_t *this, + inode_t *inode, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *undid_pending, + struct afr_reply *replies, unsigned char *pflag) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = (long) cookie; - - /* TODO: what if lock fails? */ - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - sh->op_failed = 1; - - gf_log (this->name, - (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR), - "locking of %s on child %d failed: %s", - local->loc.path, child_index, - strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "inode of %s on child %d locked", - local->loc.path, child_index); - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (sh->op_failed) { - afr_sh_metadata_finish (frame, this); - return 0; - } - - afr_sh_metadata_lookup (frame, this); - } - - return 0; + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + int i = 0; + uint64_t *witness = NULL; + + priv = this->private; + + ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies); + if (ret) + return ret; + + witness = alloca0(sizeof(*witness) * priv->child_count); + ret = afr_selfheal_find_direction(frame, this, replies, + AFR_METADATA_TRANSACTION, locked_on, + sources, sinks, witness, pflag); + if (ret) + return ret; + + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). + + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. + */ + AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count); + + /* If any source has witness, pick first + * witness source and make everybody else sinks */ + for (i = 0; i < priv->child_count; i++) { + if (sources[i] && witness[i]) { + source = i; + break; + } + } + + if (source != -1) { + for (i = 0; i < priv->child_count; i++) { + if (i != source && sources[i]) { + sources[i] = 0; + healed_sinks[i] = 1; + } + } + } + + source = __afr_selfheal_metadata_finalize_source( + frame, this, inode, sources, sinks, healed_sinks, undid_pending, + locked_on, replies); + + if (source < 0) + return -EIO; + + return source; } - int -afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this) +afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - int call_count = 0; - struct flock flock = {0, }; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - call_count = local->child_count; - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - flock.l_start = 0; - flock.l_len = 0; - flock.l_type = F_WRLCK; - - if (local->child_up[i]) { - gf_log (this->name, GF_LOG_DEBUG, - "locking %s on subvolume %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_metadata_lk_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->inodelk, - this->name, - &local->loc, F_SETLK, &flock); - - if (!--call_count) - break; - } - } - - return 0; + afr_private_t *priv = NULL; + int ret = -1; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *data_lock = NULL; + unsigned char *healed_sinks = NULL; + unsigned char *undid_pending = NULL; + struct afr_reply *locked_replies = NULL; + gf_boolean_t did_sh = _gf_true; + int source = -1; + + priv = this->private; + + sources = alloca0(priv->child_count); + sinks = alloca0(priv->child_count); + healed_sinks = alloca0(priv->child_count); + undid_pending = alloca0(priv->child_count); + data_lock = alloca0(priv->child_count); + + locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count); + + ret = afr_selfheal_inodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0, + data_lock); + { + if (ret < priv->child_count) { + ret = -ENOTCONN; + goto unlock; + } + + ret = __afr_selfheal_metadata_prepare( + frame, this, inode, data_lock, sources, sinks, healed_sinks, + undid_pending, locked_replies, NULL); + if (ret < 0) + goto unlock; + + source = ret; + + if (AFR_COUNT(healed_sinks, priv->child_count) == 0) { + did_sh = _gf_false; + goto unlock; + } + + ret = __afr_selfheal_metadata_do(frame, this, inode, source, + healed_sinks, locked_replies); + if (ret) + goto unlock; + + afr_selfheal_restore_time(frame, this, inode, source, healed_sinks, + locked_replies); + + ret = afr_selfheal_undo_pending( + frame, this, inode, sources, sinks, healed_sinks, undid_pending, + AFR_METADATA_TRANSACTION, locked_replies, data_lock); + } +unlock: + afr_selfheal_uninodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0, + data_lock); + + if (did_sh) + afr_log_selfheal(inode->gfid, this, ret, "metadata", source, sources, + healed_sinks); + else + ret = 1; + + if (locked_replies) + afr_replies_wipe(locked_replies, priv->child_count); + return ret; } - int -afr_self_heal_metadata (call_frame_t *frame, xlator_t *this) +afr_selfheal_metadata_by_stbuf(xlator_t *this, struct iatt *stbuf) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = this->private; - - - local = frame->local; - sh = &local->self_heal; - - if (local->need_metadata_self_heal && priv->metadata_self_heal) { - afr_sh_metadata_lock (frame, this); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "proceeding to data check on %s", - local->loc.path); - afr_sh_metadata_done (frame, this); - } - - return 0; + inode_t *inode = NULL; + inode_t *link_inode = NULL; + call_frame_t *frame = NULL; + int ret = 0; + + if (gf_uuid_is_null(stbuf->ia_gfid)) { + ret = -EINVAL; + goto out; + } + + inode = inode_new(this->itable); + if (!inode) { + ret = -ENOMEM; + goto out; + } + + link_inode = inode_link(inode, NULL, NULL, stbuf); + if (!link_inode) { + ret = -ENOMEM; + goto out; + } + + frame = afr_frame_create(this, &ret); + if (!frame) { + ret = -ret; + goto out; + } + + ret = afr_selfheal_metadata(frame, this, link_inode); +out: + if (inode) + inode_unref(inode); + if (link_inode) + inode_unref(link_inode); + if (frame) + AFR_STACK_DESTROY(frame); + return ret; } - |
