Diffstat (limited to 'xlators/cluster/afr/src/afr-self-heal-data.c')
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-data.c | 1841 |
1 file changed, 763 insertions, 1078 deletions
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 75090ad79ae..37bcc2b3f9e 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -1,1206 +1,891 @@ /* - Copyright (c) 2008-2009 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -#include <libgen.h> -#include <unistd.h> -#include <fnmatch.h> -#include <sys/time.h> -#include <stdlib.h> -#include <signal.h> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ -#include "glusterfs.h" #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" - -#include "afr-transaction.h" #include "afr-self-heal.h" -#include "afr-self-heal-common.h" -#include "afr-self-heal-algorithm.h" +#include <glusterfs/byte-order.h> +#include "protocol-common.h" +#include "afr-messages.h" +#include <glusterfs/events.h> - -int -afr_sh_data_done (call_frame_t *frame, xlator_t *this) +#define HAS_HOLES(i) ((i->ia_blocks * 512) < (i->ia_size)) +static int +__checksum_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, uint32_t weak, uint8_t *strong, dict_t *xdata) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - /* - TODO: cleanup sh->* - */ - - if (sh->healing_fd && !sh->healing_fd_opened) { - /* unref only if we created the fd ourselves */ - - fd_unref (sh->healing_fd); - sh->healing_fd = NULL; + afr_local_t *local = NULL; + struct afr_reply *replies = NULL; + int i = (long)cookie; + + local = frame->local; + replies = local->replies; + + replies[i].valid = 1; + replies[i].op_ret = op_ret; + replies[i].op_errno = op_errno; + if (xdata) { + replies[i].buf_has_zeroes = dict_get_str_boolean( + xdata, "buf-has-zeroes", _gf_false); + replies[i].fips_mode_rchecksum = dict_get_str_boolean( + xdata, "fips-mode-rchecksum", _gf_false); + } + if (strong) { + if (replies[i].fips_mode_rchecksum) { + memcpy(local->replies[i].checksum, strong, SHA256_DIGEST_LENGTH); + } else { + memcpy(local->replies[i].checksum, strong, MD5_DIGEST_LENGTH); } + } - for (i = 0; i < priv->child_count; i++) - sh->locked_nodes[i] = 0; - - gf_log (this->name, GF_LOG_TRACE, - "self heal of %s 
completed", - local->loc.path); - - sh->completion_cbk (frame, this); - - return 0; -} - - -int -afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - int call_count = 0; - - int child_index = (long) cookie; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "flush or setattr failed on %s on subvolume %s: %s", - local->loc.path, priv->children[child_index]->name, - strerror (op_errno)); - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_sh_data_done (frame, this); - } - - return 0; + syncbarrier_wake(&local->barrier); + return 0; } - -int -afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *statpre, struct stat *statpost) +static gf_boolean_t +__afr_can_skip_data_block_heal(call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *healed_sinks, + off_t offset, size_t size, struct iatt *poststat) { - afr_sh_data_flush_cbk (frame, cookie, this, op_ret, op_errno); - - return 0; -} - - -int -afr_sh_data_close (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - - int i = 0; - int call_count = 0; - int source = 0; - int active_sinks = 0; - int32_t valid = 0; - - struct stat stbuf = {0,}; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - active_sinks = sh->active_sinks; - - valid |= (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME); - -#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC - stbuf.st_atim = sh->buf[source].st_atim; - stbuf.st_mtim = sh->buf[source].st_mtim; - -#elif HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC - stbuf.st_atimespec = sh->buf[source].st_atimespec; - stbuf.st_mtimespec = sh->buf[source].st_mtimespec; -#else - stbuf.st_atime = sh->buf[source].st_atime; - stbuf.st_mtime = sh->buf[source].st_mtime; -#endif - - if (sh->healing_fd_opened) { - /* not our job to close the fd */ - - afr_sh_data_done (frame, this); - return 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + unsigned char *wind_subvols = NULL; + gf_boolean_t checksum_match = _gf_true; + struct afr_reply *replies = NULL; + dict_t *xdata = NULL; + int i = 0; + + priv = this->private; + local = frame->local; + replies = local->replies; + + xdata = dict_new(); + if (!xdata) + goto out; + if (dict_set_int32_sizen(xdata, "check-zero-filled", 1)) { + dict_unref(xdata); + goto out; + } + + wind_subvols = alloca0(priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (i == source || healed_sinks[i]) + wind_subvols[i] = 1; + } + + AFR_ONLIST(wind_subvols, frame, __checksum_cbk, rchecksum, fd, offset, size, + xdata); + if (xdata) + dict_unref(xdata); + + if (!replies[source].valid || replies[source].op_ret != 0) + return _gf_false; + + for (i = 0; i < priv->child_count; i++) { + if (i == source) + continue; + if (replies[i].valid) { + if (memcmp(replies[source].checksum, replies[i].checksum, + replies[source].fips_mode_rchecksum + ? 
SHA256_DIGEST_LENGTH + : MD5_DIGEST_LENGTH)) { + checksum_match = _gf_false; + break; + } } - - if (!sh->healing_fd) { - afr_sh_data_done (frame, this); - return 0; - } - - call_count = (sh->active_sinks + 1) * 2; - local->call_count = call_count; - - /* closed source */ - gf_log (this->name, GF_LOG_TRACE, - "closing fd of %s on %s", - local->loc.path, priv->children[sh->source]->name); - - STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, - (void *) (long) sh->source, - priv->children[sh->source], - priv->children[sh->source]->fops->flush, - sh->healing_fd); - call_count--; - - STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk, - (void *) (long) sh->source, - priv->children[sh->source], - priv->children[sh->source]->fops->setattr, - &local->loc, &stbuf, valid); - - call_count--; - - if (call_count == 0) - return 0; - - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] || !local->child_up[i]) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "closing fd of %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->flush, - sh->healing_fd); - - call_count--; - - STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setattr, - &local->loc, &stbuf, valid); - - if (!--call_count) - break; - } - - return 0; + } + + if (checksum_match) { + if (HAS_HOLES(poststat)) + return _gf_true; + + /* For non-sparse files, we might be better off writing the + * zeroes to sinks to avoid mismatch of disk-usage in bricks. */ + if (local->replies[source].buf_has_zeroes) + return _gf_false; + else + return _gf_true; + } +out: + return _gf_false; } - -int -afr_sh_data_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +static gf_boolean_t +__afr_is_sink_zero_filled(xlator_t *this, fd_t *fd, size_t size, off_t offset, + int sink) { - afr_local_t * local = NULL; - int call_count = 0; - int child_index = (long) cookie; - - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "locking inode of %s on child %d failed: %s", - local->loc.path, child_index, - strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_TRACE, - "inode of %s on child %d locked", - local->loc.path, child_index); - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_sh_data_close (frame, this); - } - - return 0; + afr_private_t *priv = NULL; + struct iobref *iobref = NULL; + struct iovec *iovec = NULL; + int count = 0; + int ret = 0; + gf_boolean_t zero_filled = _gf_false; + + priv = this->private; + ret = syncop_readv(priv->children[sink], fd, size, offset, 0, &iovec, + &count, &iobref, NULL, NULL, NULL); + if (ret < 0) + goto out; + ret = iov_0filled(iovec, count); + if (!ret) + zero_filled = _gf_true; +out: + if (iovec) + GF_FREE(iovec); + if (iobref) + iobref_unref(iobref); + return zero_filled; } - -int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this) +static int +__afr_selfheal_data_read_write(call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *healed_sinks, + off_t offset, size_t size, + struct afr_reply *replies, int type) { - struct flock flock; - int i = 0; - int call_count = 0; - - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - afr_self_heal_t * sh = NULL; - - - local = frame->local; - sh = &local->self_heal; - priv = 
this->private; - - if (sh->data_lock_held) { - /* not our job to unlock, proceed to close */ - - afr_sh_data_close (frame, this); - return 0; - } + struct iovec *iovec = NULL; + int count = 0; + struct iobref *iobref = NULL; + int ret = 0; + int i = 0; + afr_private_t *priv = NULL; + + priv = this->private; + + ret = syncop_readv(priv->children[source], fd, size, offset, 0, &iovec, + &count, &iobref, NULL, NULL, NULL); + if (ret <= 0) + return ret; - for (i = 0; i < priv->child_count; i++) { - if (sh->locked_nodes[i]) - call_count++; + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i]) + continue; + + /* + * TODO: Use fiemap() and discard() to heal holes + * in the future. + * + * For now, + * + * - if the source had any holes at all, + * AND + * - if we are writing past the original file size + * of the sink + * AND + * - is NOT the last block of the source file. if + * the block contains EOF, it has to be written + * in order to set the file size even if the + * last block is 0-filled. + * AND + * - if the read buffer is filled with only 0's + * + * then, skip writing to this source. We don't depend + * on the write to happen to update the size as we + * have performed an ftruncate() upfront anyways. + */ +#define is_last_block(o, b, s) ((s >= o) && (s <= (o + b))) + if (HAS_HOLES((&replies[source].poststat)) && + offset >= replies[i].poststat.ia_size && + !is_last_block(offset, size, replies[source].poststat.ia_size) && + (iov_0filled(iovec, count) == 0)) + continue; + + /* Avoid filling up sparse regions of the sink with 0-filled + * writes.*/ + if (type == AFR_SELFHEAL_DATA_FULL && + HAS_HOLES((&replies[source].poststat)) && + ((offset + size) <= replies[i].poststat.ia_size) && + (iov_0filled(iovec, count) == 0) && + __afr_is_sink_zero_filled(this, fd, size, offset, i)) { + continue; } - if (call_count == 0) { - afr_sh_data_close (frame, this); - return 0; + ret = syncop_writev(priv->children[i], fd, iovec, count, offset, iobref, + 0, NULL, NULL, NULL, NULL); + if (ret != iov_length(iovec, count)) { + /* write() failed on this sink. unset the corresponding + member in sinks[] (which is healed_sinks[] in the + caller) so that this server does NOT get considered + as successfully healed. 
+ */ + healed_sinks[i] = 0; } + } + if (iovec) + GF_FREE(iovec); + if (iobref) + iobref_unref(iobref); - local->call_count = call_count; - - flock.l_start = 0; - flock.l_len = 0; - flock.l_type = F_UNLCK; - - for (i = 0; i < priv->child_count; i++) { - if (sh->locked_nodes[i]) { - gf_log (this->name, GF_LOG_TRACE, - "unlocking %s on subvolume %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_data_unlck_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->inodelk, - this->name, - &local->loc, F_SETLK, &flock); - if (!--call_count) - break; - } - } - - return 0; -} - - -int -afr_sh_data_finish (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - gf_log (this->name, GF_LOG_TRACE, - "finishing data selfheal of %s", local->loc.path); - - afr_sh_data_unlock (frame, this); - - return 0; + return ret; } - -int -afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr) +static gf_boolean_t +afr_source_sinks_locked(xlator_t *this, unsigned char *locked_on, int source, + unsigned char *healed_sinks) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; + afr_private_t *priv = this->private; + int i = 0; - LOCK (&frame->lock); - { - } - UNLOCK (&frame->lock); + if (!locked_on[source]) + return _gf_false; - call_count = afr_frame_return (frame); + for (i = 0; i < priv->child_count; i++) { + if (healed_sinks[i] && locked_on[i]) + return _gf_true; + } - if (call_count == 0) - afr_sh_data_finish (frame, this); - - return 0; + return _gf_false; } - -int -afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this) +static int +afr_selfheal_data_block(call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *healed_sinks, off_t offset, + size_t size, int type, struct afr_reply *replies) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; - dict_t **erase_xattr = NULL; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, sh->success, - priv->child_count, AFR_DATA_TRANSACTION); - - erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count); - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - call_count++; - - erase_xattr[i] = get_new_dict(); - dict_ref (erase_xattr[i]); - } - } - - afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr, - priv->child_count, AFR_DATA_TRANSACTION); - - local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (!erase_xattr[i]) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "erasing pending flags from %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_data_erase_pending_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - sh->healing_fd, - GF_XATTROP_ADD_ARRAY, erase_xattr[i]); - if (!--call_count) - break; - } - - for (i = 0; i < priv->child_count; i++) { - if (erase_xattr[i]) { - dict_unref (erase_xattr[i]); - } - } - FREE (erase_xattr); - - return 0; -} + int ret = -1; + afr_private_t *priv = NULL; + unsigned char *data_lock = NULL; + + priv = this->private; + data_lock = alloca0(priv->child_count); + + ret = afr_selfheal_inodelk(frame, this, fd->inode, 
this->name, offset, size, + data_lock); + { + if (!afr_source_sinks_locked(this, data_lock, source, healed_sinks)) { + ret = -ENOTCONN; + goto unlock; + } + if (type == AFR_SELFHEAL_DATA_DIFF && + __afr_can_skip_data_block_heal(frame, this, fd, source, + healed_sinks, offset, size, + &replies[source].poststat)) { + ret = 0; + goto unlock; + } -int -afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *prebuf, - struct stat *postbuf) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t *sh = NULL; - int call_count = 0; - int child_index = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) - gf_log (this->name, GF_LOG_DEBUG, - "ftruncate of %s on subvolume %s failed (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - else - gf_log (this->name, GF_LOG_TRACE, - "ftruncate of %s on subvolume %s completed", - local->loc.path, - priv->children[child_index]->name); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_sh_data_erase_pending (frame, this); - } - - return 0; + ret = __afr_selfheal_data_read_write( + frame, this, fd, source, healed_sinks, offset, size, replies, type); + } +unlock: + afr_selfheal_uninodelk(frame, this, fd->inode, this->name, offset, size, + data_lock); + return ret; } - -int -afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this) +static int +afr_selfheal_data_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *healed_sinks) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t *sh = NULL; - int *sources = NULL; - int call_count = 0; - int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + local = frame->local; + priv = this->private; - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sources = sh->sources; - call_count = sh->active_sinks; - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (sources[i] || !local->child_up[i]) - continue; - - STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->ftruncate, - sh->healing_fd, sh->file_size); + if (!priv->ensure_durability) + return 0; - if (!--call_count) - break; - } + AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, fsync, fd, 0, NULL); - return 0; + for (i = 0; i < priv->child_count; i++) + if (healed_sinks[i] && local->replies[i].op_ret != 0) + /* fsync() failed. Do NOT consider this server + as successfully healed. Mark it so. 
+ */ + healed_sinks[i] = 0; + return 0; } - -static struct afr_sh_algorithm * -sh_algo_from_name (xlator_t *this, char *name) +static int +afr_data_self_heal_type_get(afr_private_t *priv, unsigned char *healed_sinks, + int source, struct afr_reply *replies) { - int i = 0; - - while (afr_self_heal_algorithms[i].name) { - if (!strcmp (name, afr_self_heal_algorithms[i].name)) { - return &afr_self_heal_algorithms[i]; - } + int type = AFR_SELFHEAL_DATA_FULL; + int i = 0; - i++; + if (priv->data_self_heal_algorithm == AFR_SELFHEAL_DATA_DYNAMIC) { + type = AFR_SELFHEAL_DATA_FULL; + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i] && i != source) + continue; + if (replies[i].poststat.ia_size) { + type = AFR_SELFHEAL_DATA_DIFF; + break; + } } - - return NULL; + } else { + type = priv->data_self_heal_algorithm; + } + return type; } - static int -sh_zero_byte_files_exist (afr_self_heal_t *sh, int child_count) +afr_selfheal_data_do(call_frame_t *frame, xlator_t *this, fd_t *fd, int source, + unsigned char *healed_sinks, struct afr_reply *replies) { - int i; - int ret = 0; - - for (i = 0; i < child_count; i++) { - if (sh->buf[i].st_size == 0) { - ret = 1; - break; - } + afr_private_t *priv = NULL; + off_t off = 0; + size_t block = 0; + int type = AFR_SELFHEAL_DATA_FULL; + int ret = -1; + call_frame_t *iter_frame = NULL; + unsigned char arbiter_sink_status = 0; + + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO, + "performing data selfheal on %s", uuid_utoa(fd->inode->gfid)); + + priv = this->private; + if (priv->arbiter_count) { + arbiter_sink_status = healed_sinks[ARBITER_BRICK_INDEX]; + healed_sinks[ARBITER_BRICK_INDEX] = 0; + } + + block = 128 * 1024 * priv->data_self_heal_window_size; + + type = afr_data_self_heal_type_get(priv, healed_sinks, source, replies); + + iter_frame = afr_copy_frame(frame); + if (!iter_frame) { + ret = -ENOMEM; + goto out; + } + + for (off = 0; off < replies[source].poststat.ia_size; off += block) { + if (AFR_COUNT(healed_sinks, priv->child_count) == 0) { + ret = -ENOTCONN; + goto out; } - return ret; -} - + ret = afr_selfheal_data_block(iter_frame, this, fd, source, + healed_sinks, off, block, type, replies); + if (ret < 0) + goto out; -struct afr_sh_algorithm * -afr_sh_data_pick_algo (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - struct afr_sh_algorithm * algo = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - algo = sh_algo_from_name (this, priv->data_self_heal_algorithm); - - if (algo == NULL) { - /* option not set, so fall back on heuristics */ - - if ((local->enoent_count != 0) - || sh_zero_byte_files_exist (sh, priv->child_count) - || (sh->file_size <= (priv->data_self_heal_window_size * this->ctx->page_size))) { - - /* - * If the file does not exist on one of the subvolumes, - * or a zero-byte file exists (created by entry self-heal) - * the entire content has to be copied anyway, so there - * is no benefit from using the "diff" algorithm. - * - * If the file size is about the same as page size, - * the entire file can be read and written with a few - * (pipelined) STACK_WINDs, which will be faster - * than "diff" which has to read checksums and then - * read and write. 
- */ - - algo = sh_algo_from_name (this, "full"); - - } else { - algo = sh_algo_from_name (this, "diff"); - } + AFR_STACK_RESET(iter_frame); + if (iter_frame->local == NULL) { + ret = -ENOTCONN; + goto out; } + } - return algo; -} + ret = afr_selfheal_data_fsync(frame, this, fd, healed_sinks); +out: + if (arbiter_sink_status) + healed_sinks[ARBITER_BRICK_INDEX] = arbiter_sink_status; -int -afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int active_sinks = 0; - int source = 0; - int i = 0; - - struct afr_sh_algorithm *sh_algo = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] == 0 && local->child_up[i] == 1) { - active_sinks++; - sh->success[i] = 1; - } - } - sh->success[source] = 1; - - if (active_sinks == 0) { - gf_log (this->name, GF_LOG_TRACE, - "no active sinks for performing self-heal on file %s", - local->loc.path); - afr_sh_data_finish (frame, this); - return 0; - } - sh->active_sinks = active_sinks; - - gf_log (this->name, GF_LOG_DEBUG, - "self-healing file %s from subvolume %s to %d other", - local->loc.path, priv->children[source]->name, active_sinks); - - sh->algo_completion_cbk = afr_sh_data_trim_sinks; - sh->algo_abort_cbk = afr_sh_data_finish; - - sh_algo = afr_sh_data_pick_algo (frame, this); - - sh_algo->fn (frame, this); - - return 0; + if (iter_frame) + AFR_STACK_DESTROY(iter_frame); + return ret; } - -int -afr_sh_data_fix (call_frame_t *frame, xlator_t *this) +static int +__afr_selfheal_truncate_sinks(call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *healed_sinks, uint64_t size) { - afr_local_t *local = NULL; - afr_local_t * orig_local = NULL; - - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int nsources = 0; - int source = 0; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - afr_sh_build_pending_matrix (priv, sh->pending_matrix, sh->xattr, - priv->child_count, AFR_DATA_TRANSACTION); - - afr_sh_print_pending_matrix (sh->pending_matrix, this); - - nsources = afr_sh_mark_sources (sh, priv->child_count, - AFR_SELF_HEAL_DATA); - - afr_sh_supress_errenous_children (sh->sources, sh->child_errno, - priv->child_count); - - if (nsources == 0) { - gf_log (this->name, GF_LOG_TRACE, - "No self-heal needed for %s", - local->loc.path); - - afr_sh_data_finish (frame, this); - return 0; - } - - if ((nsources == -1) - && (priv->favorite_child != -1) - && (sh->child_errno[priv->favorite_child] == 0)) { - - gf_log (this->name, GF_LOG_DEBUG, - "Picking favorite child %s as authentic source to resolve conflicting data of %s", - priv->children[priv->favorite_child]->name, - local->loc.path); - - sh->sources[priv->favorite_child] = 1; - - nsources = afr_sh_source_count (sh->sources, - priv->child_count); - } - - if (nsources == -1) { - gf_log (this->name, GF_LOG_ERROR, - "Unable to self-heal contents of '%s' (possible split-brain). 
" - "Please delete the file from all but the preferred " - "subvolume.", local->loc.path); - - local->govinda_gOvinda = 1; - - afr_sh_data_finish (frame, this); - return 0; - } - - source = afr_sh_select_source (sh->sources, priv->child_count); - - if (source == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "No active sources found."); - - afr_sh_data_finish (frame, this); - return 0; - } - - sh->source = source; - sh->block_size = 65536; - sh->file_size = sh->buf[source].st_size; - - if (FILE_HAS_HOLES (&sh->buf[source])) - sh->file_has_holes = 1; - - orig_local = sh->orig_frame->local; - orig_local->cont.lookup.buf.st_size = sh->buf[source].st_size; - - /* detect changes not visible through pending flags -- JIC */ - for (i = 0; i < priv->child_count; i++) { - if (i == source || sh->child_errno[i]) - continue; - - if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[source])) - sh->sources[i] = 0; - } - - afr_set_read_child (this, local->loc.inode, sh->source); - - if (sh->background) { - sh->unwind (sh->orig_frame, this); - sh->unwound = _gf_true; - } - - afr_sh_data_sync_prepare (frame, this); - - return 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + + /* This will send truncate on the arbiter brick as well if it is marked as + * sink. If changelog is enabled on the volume it captures truncate as a + * data transactions on the arbiter brick. This will help geo-rep to + * properly sync the data from master to slave if arbiter is the ACTIVE + * brick during syncing and which had got some entries healed for data as + * part of self heal. + */ + AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, ftruncate, fd, size, + NULL); + + for (i = 0; i < priv->child_count; i++) + if (healed_sinks[i] && local->replies[i].op_ret == -1) + /* truncate() failed. Do NOT consider this server + as successfully healed. Mark it so. 
+ */ + healed_sinks[i] = 0; + + return 0; } - -int -afr_self_heal_get_source (xlator_t *this, afr_local_t *local, dict_t **xattr) +gf_boolean_t +afr_has_source_witnesses(xlator_t *this, unsigned char *sources, + uint64_t *witness) { - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - int nsources = 0; - int source = 0; - int i = 0; + int i = 0; + afr_private_t *priv = NULL; - sh = &local->self_heal; - priv = this->private; + priv = this->private; - sh->pending_matrix = CALLOC (sizeof (int32_t *), priv->child_count); - for (i = 0; i < priv->child_count; i++) { - sh->pending_matrix[i] = CALLOC (sizeof (int32_t), - priv->child_count); - } - - sh->sources = CALLOC (priv->child_count, sizeof (*sh->sources)); - - afr_sh_build_pending_matrix (priv, sh->pending_matrix, xattr, - priv->child_count, AFR_DATA_TRANSACTION); - - nsources = afr_sh_mark_sources (sh, priv->child_count, - AFR_SELF_HEAL_DATA); - - source = afr_sh_select_source (sh->sources, priv->child_count); - - return source; + for (i = 0; i < priv->child_count; i++) { + if (sources[i] && witness[i]) + return _gf_true; + } + return _gf_false; } - -int -afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct stat *buf) +static gf_boolean_t +afr_does_size_mismatch(xlator_t *this, unsigned char *sources, + struct afr_reply *replies) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - int call_count = -1; - int child_index = (long) cookie; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; + int i = 0; + afr_private_t *priv = NULL; + struct iatt *min = NULL; + struct iatt *max = NULL; - LOCK (&frame->lock); - { - if (op_ret != -1) { - gf_log (this->name, GF_LOG_TRACE, - "fstat of %s on %s succeeded", - local->loc.path, - priv->children[child_index]->name); + priv = this->private; - sh->buf[child_index] = *buf; - } - } - UNLOCK (&frame->lock); + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; - call_count = afr_frame_return (frame); + if (replies[i].op_ret < 0) + continue; - if (call_count == 0) { - afr_sh_data_fix (frame, this); - } + if (!sources[i]) + continue; - return 0; -} + if (AFR_IS_ARBITER_BRICK(priv, i) && (replies[i].poststat.ia_size == 0)) + continue; + if (!min) + min = &replies[i].poststat; -int -afr_sh_data_fstat (call_frame_t *frame, xlator_t *this) -{ - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + if (!max) + max = &replies[i].poststat; - int call_count = 0; - int i = 0; + if (min->ia_size > replies[i].poststat.ia_size) + min = &replies[i].poststat; - priv = this->private; - local = frame->local; - sh = &local->self_heal; + if (max->ia_size < replies[i].poststat.ia_size) + max = &replies[i].poststat; + } - call_count = afr_up_children_count (priv->child_count, - local->child_up); + if (min && max) { + if (min->ia_size != max->ia_size) + return _gf_true; + } - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_sh_data_fstat_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fstat, - sh->healing_fd); - - if (!--call_count) - break; - } - } - - return 0; + return _gf_false; } - -int -afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) +static void +afr_mark_biggest_witness_as_source(xlator_t *this, unsigned char *sources, + 
uint64_t *witness) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - int call_count = -1; - int child_index = (long) cookie; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret != -1) { - gf_log (this->name, GF_LOG_TRACE, - "fxattrop of %s on %s succeeded", - local->loc.path, - priv->children[child_index]->name); - - sh->xattr[child_index] = dict_ref (xattr); - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_sh_data_fstat (frame, this); - } - - return 0; + int i = 0; + afr_private_t *priv = NULL; + uint64_t biggest_witness = 0; + + priv = this->private; + /* Find source with biggest witness count */ + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (biggest_witness < witness[i]) + biggest_witness = witness[i]; + } + + /* Mark files with less witness count as not source */ + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (witness[i] < biggest_witness) + sources[i] = 0; + } + + return; } - -int -afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this) +/* This is a tie breaker function. Only one source be assigned here */ +static void +afr_mark_newest_file_as_source(xlator_t *this, unsigned char *sources, + struct afr_reply *replies) { - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - dict_t *xattr_req = NULL; - - int32_t zero_pending[3] = {0, 0, 0}; - - int call_count = 0; - int i = 0; - int ret = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - call_count = afr_up_children_count (priv->child_count, - local->child_up); - - local->call_count = call_count; - - xattr_req = dict_new(); - if (xattr_req) { - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_static_bin (xattr_req, priv->pending_key[i], - zero_pending, 3 * sizeof(int32_t)); - } + int i = 0; + afr_private_t *priv = NULL; + int source = -1; + uint32_t max_ctime = 0; + + priv = this->private; + /* Find source with latest ctime */ + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + + if (max_ctime <= replies[i].poststat.ia_ctime) { + source = i; + max_ctime = replies[i].poststat.ia_ctime; } + } - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_sh_data_fxattrop_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - sh->healing_fd, GF_XATTROP_ADD_ARRAY, - xattr_req); - - if (!--call_count) - break; - } - } - - if (xattr_req) - dict_unref (xattr_req); - - return 0; + /* Only mark one of the files as source to break ties */ + memset(sources, 0, sizeof(*sources) * priv->child_count); + sources[source] = 1; } - -int -afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, int child_index); - -int -afr_sh_data_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +static int +__afr_selfheal_data_finalize_source( + call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on, + unsigned char *undid_pending, struct afr_reply *replies, uint64_t *witness) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int child_index = (long) cookie; - - /* TODO: what if lock fails? 
*/ - - local = frame->local; - sh = &local->self_heal; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - sh->locked_nodes[child_index] = 0; - - gf_log (this->name, GF_LOG_DEBUG, - "locking of %s on child %d failed: %s", - local->loc.path, child_index, - strerror (op_errno)); - } else { - sh->locked_nodes[child_index] = 1; - sh->lock_count++; - - gf_log (this->name, GF_LOG_TRACE, - "inode of %s on child %d locked", - local->loc.path, child_index); - } - } - UNLOCK (&frame->lock); - - afr_sh_data_lock_rec (frame, this, child_index + 1); - - return 0; -} + afr_private_t *priv = NULL; + int source = -1; + int sources_count = 0; + priv = this->private; + + sources_count = AFR_COUNT(sources, priv->child_count); + + if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) || + !sources_count) { + /* split brain */ + source = afr_mark_split_brain_source_sinks( + frame, this, inode, sources, sinks, healed_sinks, locked_on, + replies, AFR_DATA_TRANSACTION); + if (source < 0) { + gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" + "subvol=%s;type=data;" + "file=%s", + this->ctx->cmd_args.client_pid, this->name, + uuid_utoa(inode->gfid)); + return -EIO; + } + _afr_fav_child_reset_sink_xattrs( + frame, this, inode, source, healed_sinks, undid_pending, + AFR_DATA_TRANSACTION, locked_on, replies); + goto out; + } + + /* No split brain at this point. If we were called from + * afr_heal_splitbrain_file(), abort.*/ + if (afr_dict_contains_heal_op(frame)) + return -EIO; + + /* If there are no witnesses/size-mismatches on sources we are done*/ + if (!afr_does_size_mismatch(this, sources, replies) && + !afr_has_source_witnesses(this, sources, witness)) + goto out; + + afr_mark_largest_file_as_source(this, sources, replies); + afr_mark_biggest_witness_as_source(this, sources, witness); + afr_mark_newest_file_as_source(this, sources, replies); + if (priv->arbiter_count) + /* Choose non-arbiter brick as source for empty files. */ + afr_mark_source_sinks_if_file_empty(this, sources, sinks, healed_sinks, + locked_on, replies, + AFR_DATA_TRANSACTION); + +out: + afr_mark_active_sinks(this, sources, locked_on, healed_sinks); + source = afr_choose_source_by_policy(priv, sources, AFR_DATA_TRANSACTION); + + return source; +} +/* + * __afr_selfheal_data_prepare: + * + * This function inspects the on-disk xattrs and determines which subvols + * are sources and sinks. + * + * The return value is the index of the subvolume to be used as the source + * for self-healing, or -1 if no healing is necessary/split brain. 
+ */ int -afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, int child_index) +__afr_selfheal_data_prepare(call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *locked_on, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, + unsigned char *undid_pending, + struct afr_reply *replies, unsigned char *pflag) { - struct flock flock; - int i = 0; + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + uint64_t *witness = NULL; - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - afr_self_heal_t * sh = NULL; + priv = this->private; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies); - flock.l_start = 0; - flock.l_len = 0; - flock.l_type = F_WRLCK; - - /* skip over children that are down */ - while ((child_index < priv->child_count) - && !local->child_up[child_index]) - child_index++; - - if ((child_index == priv->child_count) && - sh->lock_count == 0) { - - gf_log (this->name, GF_LOG_DEBUG, - "unable to lock on even one child"); - - afr_sh_data_done (frame, this); - return 0; - } - - if ((child_index == priv->child_count) - || (sh->lock_count == afr_lock_server_count (priv, AFR_DATA_TRANSACTION))) { - afr_sh_data_fxattrop (frame, this); - return 0; - } - - gf_log (this->name, GF_LOG_TRACE, - "locking %s on subvolume %s", - local->loc.path, priv->children[i]->name); + if (ret) + return ret; - STACK_WIND_COOKIE (frame, afr_sh_data_lock_cbk, - (void *) (long) child_index, - priv->children[i], - priv->children[i]->fops->inodelk, - this->name, - &local->loc, F_SETLKW, &flock); + witness = alloca0(priv->child_count * sizeof(*witness)); + ret = afr_selfheal_find_direction(frame, this, replies, + AFR_DATA_TRANSACTION, locked_on, sources, + sinks, witness, pflag); + if (ret) + return ret; - return 0; + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. 
+ */ + AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count); + + source = __afr_selfheal_data_finalize_source( + frame, this, inode, sources, sinks, healed_sinks, locked_on, + undid_pending, replies, witness); + if (source < 0) + return -EIO; + + return source; } - -int -afr_sh_data_lock (call_frame_t *frame, xlator_t *this) +static int +__afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *locked_on) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - afr_self_heal_t * sh = NULL; - - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; + afr_private_t *priv = NULL; + int ret = -1; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *data_lock = NULL; + unsigned char *healed_sinks = NULL; + unsigned char *undid_pending = NULL; + struct afr_reply *locked_replies = NULL; + int source = -1; + gf_boolean_t did_sh = _gf_true; + gf_boolean_t is_arbiter_the_only_sink = _gf_false; + gf_boolean_t empty_file = _gf_false; + + priv = this->private; + + sources = alloca0(priv->child_count); + sinks = alloca0(priv->child_count); + healed_sinks = alloca0(priv->child_count); + data_lock = alloca0(priv->child_count); + undid_pending = alloca0(priv->child_count); + + locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count); + + ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, 0, 0, + data_lock); + { + if (ret < priv->child_count) { + gf_msg_debug(this->name, 0, + "%s: Skipping " + "self-heal as only %d number " + "of subvolumes " + "could be locked", + uuid_utoa(fd->inode->gfid), ret); + ret = -ENOTCONN; + goto unlock; + } - if (sh->data_lock_held) { - /* caller has held the lock already, - so skip locking */ + ret = __afr_selfheal_data_prepare(frame, this, fd->inode, data_lock, + sources, sinks, healed_sinks, + undid_pending, locked_replies, NULL); + if (ret < 0) + goto unlock; - afr_sh_data_fxattrop (frame, this); - return 0; + if (AFR_COUNT(healed_sinks, priv->child_count) == 0) { + did_sh = _gf_false; + goto unlock; } - for (i = 0; i < priv->child_count; i++) - sh->locked_nodes[i] = 0; + source = ret; - return afr_sh_data_lock_rec (frame, this, 0); -} + if (AFR_IS_ARBITER_BRICK(priv, source)) { + empty_file = afr_is_file_empty_on_all_children(priv, + locked_replies); + if (empty_file) + goto restore_time; + did_sh = _gf_false; + goto unlock; + } -int -afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - child_index = (long) cookie; - - /* TODO: some of the open's might fail. 
- In that case, modify cleanup fn to send flush on those - fd's which are already open */ - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_TRACE, - "open of %s failed on child %s (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - sh->op_failed = 1; - } - - gf_log (this->name, GF_LOG_TRACE, - "open of %s succeeded on child %s", - local->loc.path, - priv->children[child_index]->name); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (sh->op_failed) { - afr_sh_data_finish (frame, this); - return 0; - } - - gf_log (this->name, GF_LOG_TRACE, - "fd for %s opened, commencing sync", - local->loc.path); - - afr_sh_data_lock (frame, this); - } - - return 0; + ret = __afr_selfheal_truncate_sinks( + frame, this, fd, healed_sinks, + locked_replies[source].poststat.ia_size); + if (ret < 0) + goto unlock; + + if (priv->arbiter_count && + AFR_COUNT(healed_sinks, priv->child_count) == 1 && + healed_sinks[ARBITER_BRICK_INDEX]) { + is_arbiter_the_only_sink = _gf_true; + goto restore_time; + } + ret = 0; + } +unlock: + afr_selfheal_uninodelk(frame, this, fd->inode, this->name, 0, 0, data_lock); + if (ret < 0) + goto out; + + if (!did_sh) + goto out; + + ret = afr_selfheal_data_do(frame, this, fd, source, healed_sinks, + locked_replies); + if (ret) + goto out; +restore_time: + afr_selfheal_restore_time(frame, this, fd->inode, source, healed_sinks, + locked_replies); + + if (!is_arbiter_the_only_sink && !empty_file) { + ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, 0, 0, + data_lock); + if (ret < priv->child_count) { + ret = -ENOTCONN; + did_sh = _gf_false; + goto skip_undo_pending; + } + } + ret = afr_selfheal_undo_pending( + frame, this, fd->inode, sources, sinks, healed_sinks, undid_pending, + AFR_DATA_TRANSACTION, locked_replies, data_lock); +skip_undo_pending: + afr_selfheal_uninodelk(frame, this, fd->inode, this->name, 0, 0, data_lock); +out: + + if (did_sh) + afr_log_selfheal(fd->inode->gfid, this, ret, "data", source, sources, + healed_sinks); + else + ret = 1; + + if (locked_replies) + afr_replies_wipe(locked_replies, priv->child_count); + + return ret; } - int -afr_sh_data_open (call_frame_t *frame, xlator_t *this) +afr_selfheal_data_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, + dict_t *xdata) { - int i = 0; - int call_count = 0; + afr_local_t *local = NULL; + int i = (long)cookie; - fd_t *fd = NULL; + local = frame->local; - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - afr_self_heal_t *sh = NULL; + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + syncbarrier_wake(&local->barrier); - if (sh->healing_fd_opened) { - /* caller has opened the fd for us already, so skip open */ + return 0; +} - afr_sh_data_lock (frame, this); - return 0; +int +afr_selfheal_data_open(xlator_t *this, inode_t *inode, fd_t **fd) +{ + int ret = 0; + fd_t *fd_tmp = NULL; + loc_t loc = { + 0, + }; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + + priv = this->private; + + fd_tmp = fd_create(inode, 0); + if (!fd_tmp) + return -ENOMEM; + + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + + frame = afr_frame_create(this, &ret); + if (!frame) { + ret = -ret; + fd_unref(fd_tmp); + goto out; + } + local = 
frame->local; + + AFR_ONLIST(local->child_up, frame, afr_selfheal_data_open_cbk, open, &loc, + O_RDWR | O_LARGEFILE, fd_tmp, NULL); + + ret = -ENOTCONN; + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + + if (local->replies[i].op_ret < 0) { + ret = -local->replies[i].op_errno; + continue; } - call_count = afr_up_children_count (priv->child_count, local->child_up); - local->call_count = call_count; - - fd = fd_create (local->loc.inode, frame->root->pid); - sh->healing_fd = fd; - - /* open sinks */ - for (i = 0; i < priv->child_count; i++) { - if(!local->child_up[i]) - continue; - - STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->open, - &local->loc, - O_RDWR|O_LARGEFILE, fd, 0); - - if (!--call_count) - break; - } - - return 0; + ret = 0; + break; + } + + if (ret < 0) { + fd_unref(fd_tmp); + goto out; + } else { + fd_bind(fd_tmp); + } + + *fd = fd_tmp; +out: + loc_wipe(&loc); + if (frame) + AFR_STACK_DESTROY(frame); + return ret; } - int -afr_self_heal_data (call_frame_t *frame, xlator_t *this) +afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = this->private; - - - local = frame->local; - sh = &local->self_heal; + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + int ret = 0; + inode_t *inode = fd->inode; + + priv = this->private; + + locked_on = alloca0(priv->child_count); + + ret = afr_selfheal_tie_breaker_inodelk(frame, this, inode, priv->sh_domain, + 0, 0, locked_on); + { + if (ret < priv->child_count) { + gf_msg_debug(this->name, 0, + "%s: Skipping " + "self-heal as only %d number of " + "subvolumes could be locked", + uuid_utoa(fd->inode->gfid), ret); + /* Either less than two subvols available, or another + selfheal (from another server) is in progress. Skip + for now in any case there isn't anything to do. + */ + ret = -ENOTCONN; + goto unlock; + } - if (sh->need_data_self_heal && priv->data_self_heal) { - afr_sh_data_open (frame, this); - } else { - gf_log (this->name, GF_LOG_TRACE, - "not doing data self heal on %s", - local->loc.path); - afr_sh_data_done (frame, this); - } + ret = __afr_selfheal_data(frame, this, fd, locked_on); + } +unlock: + afr_selfheal_uninodelk(frame, this, inode, priv->sh_domain, 0, 0, + locked_on); - return 0; + return ret; } - |
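
For readers skimming the new code above: with the default "dynamic" algorithm, the patch falls back to a full heal only when the source and every candidate sink report a zero size, otherwise the checksum-driven "diff" path is used, and sparseness is detected by comparing allocated blocks against the logical size (the HAS_HOLES() macro). The standalone sketch below restates just that selection logic; the struct and enum names are simplified stand-ins for afr_private_t and struct afr_reply, not the actual GlusterFS types.

/* Simplified, standalone restatement of the heal-type selection and the
 * hole heuristic introduced in this patch. Reduced stand-in types, not
 * the real xlator API. */
#include <stdint.h>
#include <stdio.h>

enum { SELFHEAL_DATA_FULL, SELFHEAL_DATA_DIFF, SELFHEAL_DATA_DYNAMIC };

struct reply {
    uint64_t ia_size;   /* file size reported by the brick */
    uint64_t ia_blocks; /* 512-byte blocks actually allocated */
};

/* A file is treated as sparse when fewer blocks are allocated than its
 * logical size would require -- the same test as HAS_HOLES() above. */
static int has_holes(const struct reply *r)
{
    return (r->ia_blocks * 512) < r->ia_size;
}

/* "dynamic" degenerates to a full heal only if the source and all sinks
 * are empty; any non-zero size makes the checksum-based diff heal pay off. */
static int pick_heal_type(int algorithm, int child_count, int source,
                          const int *healed_sinks, const struct reply *replies)
{
    if (algorithm != SELFHEAL_DATA_DYNAMIC)
        return algorithm;

    for (int i = 0; i < child_count; i++) {
        if (!healed_sinks[i] && i != source)
            continue;
        if (replies[i].ia_size)
            return SELFHEAL_DATA_DIFF;
    }
    return SELFHEAL_DATA_FULL;
}

int main(void)
{
    struct reply replies[3] = {
        { .ia_size = 1 << 20, .ia_blocks = 256 },  /* sparse source */
        { .ia_size = 0,       .ia_blocks = 0 },    /* freshly created sink */
        { .ia_size = 1 << 20, .ia_blocks = 2048 },
    };
    int healed_sinks[3] = { 0, 1, 1 };

    printf("type=%s, source has holes=%d\n",
           pick_heal_type(SELFHEAL_DATA_DYNAMIC, 3, 0, healed_sinks,
                          replies) == SELFHEAL_DATA_DIFF ? "diff" : "full",
           has_holes(&replies[0]));
    return 0;
}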