diff options
Diffstat (limited to 'xlators/cluster/afr/src/afr-dir-read.c')
| -rw-r--r-- | xlators/cluster/afr/src/afr-dir-read.c | 784 |
1 files changed, 265 insertions, 519 deletions
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index b48488526e5..f8bf8340dab 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -1,600 +1,346 @@ /* - Copyright (c) 2007-2009 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ #include <libgen.h> #include <unistd.h> -#include <fnmatch.h> #include <sys/time.h> #include <stdlib.h> #include <signal.h> +#include <string.h> -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "checksum.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/dict.h> +#include <glusterfs/list.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/compat.h> #include "afr.h" -#include "afr-self-heal.h" - - -int -afr_examine_dir_sh_unwind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - afr_set_opendir_done (this, local->fd->inode); - - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd); - - return 0; -} - - -gf_boolean_t -__checksums_differ (uint32_t *checksum, int child_count) -{ - int ret = _gf_false; - int i = 0; - - uint32_t cksum; - - cksum = checksum[0]; - - while (i < child_count) { - if (cksum != checksum[i]) { - ret = _gf_true; - break; - } - - cksum = checksum[i]; - i++; - } - - return ret; -} - +#include "afr-transaction.h" int32_t -afr_examine_dir_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries) +afr_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - - gf_dirent_t * entry = NULL; - gf_dirent_t * tmp = NULL; - - int child_index = 0; + afr_local_t *local = NULL; + int call_count = -1; + int32_t child_index = 0; + afr_fd_ctx_t *fd_ctx = NULL; - uint32_t entry_cksum; + local = frame->local; + fd_ctx = local->fd_ctx; + child_index = (long)cookie; - int call_count = 0; - off_t last_offset = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - child_index = (long) cookie; + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + LOCK(&frame->lock); + { if (op_ret == -1) { - local->op_ret = -1; - local->op_ret = op_errno; - goto out; - } - - if (op_ret == 0) - goto out; - - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - entry_cksum = gf_rsync_weak_checksum (entry->d_name, - strlen (entry->d_name)); - local->cont.opendir.checksum[child_index] ^= entry_cksum; - } - - list_for_each_entry (entry, &entries->list, list) { - last_offset = entry->d_off; + local->op_errno = op_errno; + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } else { + local->op_ret = op_ret; + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref(xdata); } + call_count = --local->call_count; + } + UNLOCK(&frame->lock); - /* read more entries */ - - STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->readdir, - local->fd, 131072, last_offset); - -out: - if ((op_ret == 0) || (op_ret == -1)) { - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (__checksums_differ (local->cont.opendir.checksum, - priv->child_count)) { - - sh->need_entry_self_heal = _gf_true; - sh->forced_merge = _gf_true; - sh->mode = local->fd->inode->st_mode; - sh->background = _gf_false; - sh->unwind = afr_examine_dir_sh_unwind; - - gf_log (this->name, GF_LOG_DEBUG, - "checksums of directory %s differ," - " triggering forced merge", - local->loc.path); - - afr_self_heal (frame, this); - } else { - afr_set_opendir_done (this, local->fd->inode); - - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd); - } - } - } + if (call_count == 0) { + afr_handle_replies_quorum(frame, this); + AFR_STACK_UNWIND(opendir, frame, local->op_ret, local->op_errno, + local->fd, NULL); + } - return 0; + return 0; } - int -afr_examine_dir (call_frame_t *frame, xlator_t *this) +afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - - int i; - int call_count = 0; - - local = frame->local; - priv = this->private; - - local->cont.opendir.checksum = CALLOC (priv->child_count, - sizeof (*local->cont.opendir.checksum)); - - call_count = afr_up_children_count (priv->child_count, local->child_up); + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int call_count = -1; + int32_t op_errno = ENOMEM; + afr_fd_ctx_t *fd_ctx = NULL; - local->call_count = call_count; + priv = this->private; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->readdir, - local->fd, 131072, 0); + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - if (!--call_count) - break; - } - } - - return 0; -} - - -int32_t -afr_opendir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - fd_t *fd) -{ - afr_local_t * local = NULL; - - int call_count = -1; + local->op = GF_FOP_OPENDIR; - LOCK (&frame->lock); - { - local = frame->local; + if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) { + op_errno = afr_quorum_errno(priv); + goto out; + } - if (op_ret >= 0) - local->op_ret = op_ret; + if (!afr_is_consistent_io_possible(local, priv, &op_errno)) + goto out; - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + fd_ctx = afr_fd_ctx_get(fd, this); + if (!fd_ctx) + goto out; - call_count = afr_frame_return (frame); + loc_copy(&local->loc, loc); - if (call_count == 0) { - if ((local->op_ret == 0) && - !afr_is_opendir_done (this, fd->inode)) { + local->fd = fd_ref(fd); + local->fd_ctx = fd_ctx; - /* - * This is the first opendir on this inode. We need - * to check if the directory's entries are the same - * on all subvolumes. This is needed in addition - * to regular entry self-heal because the readdir - * call is sent only to the first subvolume, and - * thus files that exist only there will never be healed - * otherwise (assuming changelog shows no anamolies). - */ + call_count = local->call_count; - gf_log (this->name, GF_LOG_TRACE, - "reading contents of directory %s looking for mismatch", - local->loc.path); + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE(frame, afr_opendir_cbk, (void *)(long)i, + priv->children[i], + priv->children[i]->fops->opendir, loc, fd, NULL); - afr_examine_dir (frame, this); - - } else { - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd); - } - } + if (!--call_count) + break; + } + } - return 0; + return 0; +out: + AFR_STACK_UNWIND(opendir, frame, -1, op_errno, fd, NULL); + return 0; } - -int32_t -afr_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd) +static int +afr_validate_read_subvol(inode_t *inode, xlator_t *this, int par_read_subvol) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - - int child_count = 0; - int i = 0; - - int ret = -1; - int call_count = -1; - - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - child_count = priv->child_count; - - ALLOC_OR_GOTO (local, afr_local_t, out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - loc_copy (&local->loc, loc); + int gen = 0; + int entry_read_subvol = 0; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + data_readable = alloca0(priv->child_count); + metadata_readable = alloca0(priv->child_count); + + afr_inode_read_subvol_get(inode, this, data_readable, metadata_readable, + &gen); + + if (gen != priv->event_generation || !data_readable[par_read_subvol] || + !metadata_readable[par_read_subvol]) + return -1; + + /* Once the control reaches the following statement, it means that the + * parent's read subvol is perfectly readable. So calling + * either afr_data_subvol_get() or afr_metadata_subvol_get() would + * yield the same result. Hence, choosing afr_data_subvol_get() below. + */ + + if (!priv->consistent_metadata) + return 0; - frame->local = local; - local->fd = fd_ref (fd); + /* For an inode fetched through readdirp which is yet to be linked, + * inode ctx would not be initialised (yet). So this function returns + * -1 above due to gen being 0, which is why it is OK to pass NULL for + * read_subvol_args here. + */ + entry_read_subvol = afr_data_subvol_get(inode, this, NULL, NULL, NULL, + NULL); + if (entry_read_subvol != par_read_subvol) + return -1; + + return 0; +} - call_count = local->call_count; - - for (i = 0; i < child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_opendir_cbk, - priv->children[i], - priv->children[i]->fops->opendir, - loc, fd); +static void +afr_readdir_transform_entries(call_frame_t *frame, gf_dirent_t *subvol_entries, + int subvol, gf_dirent_t *entries, fd_t *fd) +{ + int ret = -1; + gf_dirent_t *entry = NULL; + gf_dirent_t *tmp = NULL; + xlator_t *this = NULL; + afr_private_t *priv = NULL; + gf_boolean_t need_heal = _gf_false; + gf_boolean_t validate_subvol = _gf_false; + + this = THIS; + priv = this->private; + + need_heal = afr_get_need_heal(this); + validate_subvol = need_heal | priv->consistent_metadata; + + list_for_each_entry_safe(entry, tmp, &subvol_entries->list, list) + { + if (afr_is_private_directory(priv, fd->inode->gfid, entry->d_name, + frame->root->pid)) { + continue; + } - if (!--call_count) - break; - } - } + list_del_init(&entry->list); + list_add_tail(&entry->list, &entries->list); - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (opendir, frame, op_ret, op_errno, fd); - } + if (!validate_subvol) + continue; - return 0; + if (entry->inode) { + ret = afr_validate_read_subvol(entry->inode, this, subvol); + if (ret == -1) { + inode_unref(entry->inode); + entry->inode = NULL; + continue; + } + } + } } - -/** - * Common algorithm for directory read calls: - * - * - Try the fop on the first child that is up - * - if we have failed due to ENOTCONN: - * try the next child - * - * Applicable to: readdir - */ - int32_t -afr_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries) +afr_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *subvol_entries, + dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - - gf_dirent_t * entry = NULL; - gf_dirent_t * tmp = NULL; + afr_local_t *local = NULL; + gf_dirent_t entries; - int child_index = -1; + INIT_LIST_HEAD(&entries.list); - priv = this->private; - children = priv->children; + local = frame->local; - local = frame->local; + if (op_ret < 0 && !local->cont.readdir.offset) { + /* failover only if this was first readdir, detected + by offset == 0 */ + local->op_ret = op_ret; + local->op_errno = op_errno; - child_index = (long) cookie; + afr_read_txn_continue(frame, this, (long)cookie); + return 0; + } - if (op_ret != -1) { - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - entry->d_ino = afr_itransform (entry->d_ino, - priv->child_count, - child_index); + if (op_ret >= 0) + afr_readdir_transform_entries(frame, subvol_entries, (long)cookie, + &entries, local->fd); - if ((local->fd->inode == local->fd->inode->table->root) - && !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { - list_del_init (&entry->list); - FREE (entry); - } - } - } + AFR_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, xdata); - AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries); + gf_dirent_free(&entries); - return 0; + return 0; } - -int32_t -afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries) +int +afr_readdir_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - ino_t inum = 0; - - gf_dirent_t * entry = NULL; - gf_dirent_t * tmp = NULL; - - int child_index = -1; - - priv = this->private; - children = priv->children; - - local = frame->local; - - child_index = (long) cookie; - - if (op_ret != -1) { - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - inum = afr_itransform (entry->d_ino, priv->child_count, - child_index); - entry->d_ino = inum; - inum = afr_itransform (entry->d_stat.st_ino, - priv->child_count, child_index); - entry->d_stat.st_ino = inum; - - if ((local->fd->inode == local->fd->inode->table->root) - && !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { - list_del_init (&entry->list); - FREE (entry); - } - } - } - - AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries); - + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + + priv = this->private; + local = frame->local; + fd_ctx = afr_fd_ctx_get(local->fd, this); + if (!fd_ctx) { + local->op_errno = EINVAL; + local->op_ret = -1; + } + + if (subvol == -1 || !fd_ctx) { + AFR_STACK_UNWIND(readdir, frame, local->op_ret, local->op_errno, 0, 0); return 0; + } + + fd_ctx->readdir_subvol = subvol; + + if (local->op == GF_FOP_READDIR) + STACK_WIND_COOKIE(frame, afr_readdir_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->readdir, local->fd, + local->cont.readdir.size, local->cont.readdir.offset, + local->xdata_req); + else + STACK_WIND_COOKIE(frame, afr_readdir_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->readdirp, local->fd, + local->cont.readdir.size, local->cont.readdir.offset, + local->xdata_req); + return 0; } - -int32_t -afr_do_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, int whichop) +int +afr_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, int whichop, dict_t *dict) { - afr_private_t * priv = NULL; - xlator_t ** children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - - int ret = -1; - - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - children = priv->children; - - ALLOC_OR_GOTO (local, afr_local_t, out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - frame->local = local; - - call_child = afr_first_up_child (priv); - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_DEBUG, - "no child is up"); - goto out; - } - - local->fd = fd_ref (fd); - local->cont.readdir.size = size; - local->cont.readdir.offset = offset; - - if (whichop == GF_FOP_READDIR) - STACK_WIND_COOKIE (frame, afr_readdir_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readdir, fd, - size, offset); - else - STACK_WIND_COOKIE (frame, afr_readdirp_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readdirp, fd, - size, offset); - - op_ret = 0; + afr_local_t *local = NULL; + int32_t op_errno = 0; + int subvol = -1; + afr_fd_ctx_t *fd_ctx = NULL; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + fd_ctx = afr_fd_ctx_get(fd, this); + if (!fd_ctx) { + op_errno = EINVAL; + goto out; + } + + local->op = whichop; + local->fd = fd_ref(fd); + local->cont.readdir.size = size; + local->cont.readdir.offset = offset; + local->xdata_req = (dict) ? dict_ref(dict) : NULL; + + subvol = fd_ctx->readdir_subvol; + + if (offset == 0 || subvol == -1) { + /* First readdir has option of failing over and selecting + an appropriate read subvolume */ + afr_read_txn(frame, this, fd->inode, afr_readdir_wind, + AFR_DATA_TRANSACTION); + } else { + /* But continued readdirs MUST stick to the same subvolume + without an option to failover */ + afr_readdir_wind(frame, this, subvol); + } + + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, NULL); - } - return 0; + AFR_STACK_UNWIND(readdir, frame, -1, op_errno, NULL, NULL); + return 0; } - int32_t -afr_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) +afr_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) { - afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR); - return 0; -} - + afr_do_readdir(frame, this, fd, size, offset, GF_FOP_READDIR, xdata); -int32_t -afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) -{ - afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIRP); - return 0; + return 0; } int32_t -afr_getdents_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dir_entry_t *entry, int32_t count) +afr_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *dict) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - - int unwind = 1; - int last_tried = -1; - int this_try = -1; - - priv = this->private; - children = priv->children; - - local = frame->local; - - if (op_ret == -1) { - last_tried = local->cont.getdents.last_tried; - - if (all_tried (last_tried, priv->child_count)) { - goto out; - } - - this_try = ++local->cont.getdents.last_tried; - unwind = 0; - - STACK_WIND (frame, afr_getdents_cbk, - children[this_try], - children[this_try]->fops->getdents, - local->fd, local->cont.getdents.size, - local->cont.getdents.offset, local->cont.getdents.flag); - } - -out: - if (unwind) { - AFR_STACK_UNWIND (getdents, frame, op_ret, op_errno, - entry, count); - } + afr_do_readdir(frame, this, fd, size, offset, GF_FOP_READDIRP, dict); - return 0; + return 0; } - int32_t -afr_getdents (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, int32_t flag) +afr_releasedir(xlator_t *this, fd_t *fd) { - afr_private_t * priv = NULL; - xlator_t ** children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - children = priv->children; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - call_child = afr_first_up_child (priv); - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_DEBUG, - "no child is up."); - goto out; - } - - local->cont.getdents.last_tried = call_child; + afr_cleanup_fd_ctx(this, fd); - local->fd = fd_ref (fd); - - local->cont.getdents.size = size; - local->cont.getdents.offset = offset; - local->cont.getdents.flag = flag; - - frame->local = local; - - STACK_WIND (frame, afr_getdents_cbk, - children[call_child], children[call_child]->fops->getdents, - fd, size, offset, flag); - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (getdents, frame, op_ret, op_errno, - NULL, 0); - } - - return 0; + return 0; } - - |
