diff options
Diffstat (limited to 'xlators/cluster/dht/src')
23 files changed, 32583 insertions, 5988 deletions
diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am index f87212699c9..56f1f2ad7c8 100644 --- a/xlators/cluster/dht/src/Makefile.am +++ b/xlators/cluster/dht/src/Makefile.am @@ -1,30 +1,48 @@ +xlator_LTLIBRARIES = dht.la nufa.la switch.la -xlator_LTLIBRARIES = dht.la nufa.la -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster +AM_CFLAGS = -Wall $(GF_CFLAGS) +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster -dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c \ - dht-selfheal.c dht-rename.c dht-hashfn.c dht-diskusage.c +dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c dht-rebalance.c \ + dht-selfheal.c dht-rename.c dht-hashfn.c dht-diskusage.c \ + dht-common.c dht-inode-write.c dht-inode-read.c dht-shared.c \ + dht-lock.c $(top_builddir)/xlators/lib/src/libxlator.c -dht_la_SOURCES = $(dht_common_source) dht.c +dht_la_SOURCES = $(dht_common_source) dht.c nufa_la_SOURCES = $(dht_common_source) nufa.c +switch_la_SOURCES = $(dht_common_source) switch.c -dht_la_LDFLAGS = -module -avoidversion +dht_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -nufa_la_LDFLAGS = -module -avoidversion +nufa_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = dht-common.h dht-common.c +switch_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) +switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) +noinst_HEADERS = dht-common.h dht-mem-types.h dht-messages.h \ + dht-lock.h $(top_builddir)/xlators/lib/src/libxlator.h -CLEANFILES = +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ + -I$(top_srcdir)/rpc/rpc-lib/src \ + -I$(top_srcdir)/xlators/lib/src \ + -DDATADIR=\"$(localstatedir)\" \ + -DLIBDIR=\"$(libdir)\" + +CLEANFILES = uninstall-local: rm -f $(DESTDIR)$(xlatordir)/distribute.so install-data-hook: - ln -sf dht.so $(DESTDIR)$(xlatordir)/distribute.so
\ No newline at end of file + ln -sf dht.so $(DESTDIR)$(xlatordir)/distribute.so + +if UNITTEST +CLEANFILES += *.gcda *.gcno *_xunit.xml +noinst_PROGRAMS = +TESTS = +endif diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 9254078dfe2..8ba0cc4c732 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -1,36 +1,447 @@ /* - Copyright (c) 2009-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ /* TODO: add NS locking */ -#include "glusterfs.h" -#include "xlator.h" +#include "libxlator.h" #include "dht-common.h" -#include "defaults.h" +#include "dht-lock.h" +#include <glusterfs/byte-order.h> +#include <glusterfs/quota-common-utils.h> +#include <glusterfs/upcall-utils.h> +#include "glusterfs/compat-errno.h" // for ENODATA on BSD +#include <glusterfs/common-utils.h> #include <sys/time.h> +#include <libgen.h> +#include <signal.h> + +static int +dht_rmdir_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *entries, + dict_t *xdata); + +static int +dht_link2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret); + +static int +dht_set_dir_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req); + +static int +dht_lookup_everywhere_done(call_frame_t *frame, xlator_t *this); + +static int +dht_common_mark_mdsxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata); + +static int +dht_rmdir_unlock(call_frame_t *frame, xlator_t *this); + +static const char *dht_dbg_vxattrs[] = {DHT_DBG_HASHED_SUBVOL_PATTERN, NULL}; + +/* Check the xdata to make sure EBADF has been set by client xlator */ +int32_t +dht_check_remote_fd_failed_error(dht_local_t *local, int op_ret, int op_errno) +{ + if (op_ret == -1 && (op_errno == EBADF || op_errno == EBADFD) && + !(local->fd_checked)) { + return 1; + } + return 0; +} + +/* Sets the blocks and size values to fixed values. This is to be called + * only for dirs. The caller is responsible for checking the type + */ +int32_t +dht_set_fixed_dir_stat(struct iatt *stat) +{ + if (stat) { + stat->ia_blocks = DHT_DIR_STAT_BLOCKS; + stat->ia_size = DHT_DIR_STAT_SIZE; + return 0; + } + return -1; +} + +/* Return true if key exists in array + */ +static gf_boolean_t +dht_match_xattr(const char *key) +{ + char **xattrs_to_heal = get_xattrs_to_heal(); + + return gf_get_index_by_elem(xattrs_to_heal, (char *)key) >= 0; +} + +static int +dht_aggregate_quota_xattr(dict_t *dst, char *key, data_t *value) +{ + int ret = -1; + quota_meta_t *meta_dst = NULL; + quota_meta_t *meta_src = NULL; + int64_t *size = NULL; + int64_t dst_dir_count = 0; + int64_t src_dir_count = 0; + + if (value == NULL) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DATA_NULL, + "data value is NULL"); + ret = -1; + goto out; + } + + ret = dict_get_bin(dst, key, (void **)&meta_dst); + if (ret < 0) { + meta_dst = GF_CALLOC(1, sizeof(quota_meta_t), gf_common_quota_meta_t); + if (meta_dst == NULL) { + gf_msg("dht", GF_LOG_WARNING, ENOMEM, DHT_MSG_NO_MEMORY, + "Memory allocation failed"); + ret = -1; + goto out; + } + ret = dict_set_bin(dst, key, meta_dst, sizeof(quota_meta_t)); + if (ret < 0) { + gf_msg("dht", GF_LOG_WARNING, EINVAL, DHT_MSG_DICT_SET_FAILED, + "dht aggregate dict set failed"); + GF_FREE(meta_dst); + ret = -1; + goto out; + } + } + + if (value->len > sizeof(int64_t)) { + meta_src = data_to_bin(value); + + meta_dst->size = hton64(ntoh64(meta_dst->size) + + ntoh64(meta_src->size)); + meta_dst->file_count = hton64(ntoh64(meta_dst->file_count) + + ntoh64(meta_src->file_count)); + + if (value->len > (2 * sizeof(int64_t))) { + dst_dir_count = ntoh64(meta_dst->dir_count); + src_dir_count = ntoh64(meta_src->dir_count); + + if (src_dir_count > dst_dir_count) + meta_dst->dir_count = meta_src->dir_count; + } else { + meta_dst->dir_count = 0; + } + } else { + size = data_to_bin(value); + meta_dst->size = hton64(ntoh64(meta_dst->size) + ntoh64(*size)); + } + + ret = 0; +out: + return ret; +} + +static int +add_opt(char **optsp, const char *opt) +{ + char *newopts = NULL; + unsigned oldsize = 0; + unsigned newsize = 0; + + if (*optsp == NULL) + newopts = gf_strdup(opt); + else { + oldsize = strlen(*optsp); + newsize = oldsize + 1 + strlen(opt) + 1; + newopts = GF_REALLOC(*optsp, newsize); + if (newopts) + sprintf(newopts + oldsize, ",%s", opt); + } + if (newopts == NULL) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY, + "Error to add choices in buffer in add_opt"); + return -1; + } + *optsp = newopts; + return 0; +} + +/* Return Choice list from Split brain status */ +static char * +getChoices(const char *value) +{ + int i = 0; + char *ptr = NULL; + char *tok = NULL; + char *result = NULL; + char *newval = NULL; + + ptr = strstr(value, "Choices:"); + if (!ptr) { + result = ptr; + goto out; + } + + newval = gf_strdup(ptr); + if (!newval) { + result = newval; + goto out; + } + + tok = strtok(newval, ":"); + if (!tok) { + result = tok; + goto out; + } + + while (tok) { + i++; + if (i == 2) + break; + tok = strtok(NULL, ":"); + } + + result = gf_strdup(tok); + +out: + if (newval) + GF_FREE(newval); + + return result; +} + +/* This function prepare a list of choices for key + (replica.split-brain-status) in case of metadata split brain + only on the basis of key-value passed to this function. + After prepare the list of choices it update the same key in dict + with this value to reflect the same in + replica.split-brain-status attr for file. + +*/ + +static int +dht_aggregate_split_brain_xattr(dict_t *dst, char *key, data_t *value) +{ + int ret = 0; + char *oldvalue = NULL; + char *old_choice = NULL; + char *new_choice = NULL; + char *full_choice = NULL; + char *status = NULL; + + if (value == NULL) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DATA_NULL, + "GF_AFR_SBRAIN_STATUS value is NULL"); + ret = -1; + goto out; + } + + ret = dict_get_str(dst, key, &oldvalue); + if (ret) + goto out; + + /* skip code that is irrelevant if !oldvalue */ + if (!oldvalue) + goto out; + + if (strstr(oldvalue, "not")) { + gf_msg_debug("dht", 0, "Need to update split-brain status in dict"); + ret = -1; + goto out; + } + if (strstr(oldvalue, "metadata-split-brain:yes") && + (strstr(oldvalue, "data-split-brain:no"))) { + if (strstr(value->data, "not")) { + gf_msg_debug("dht", 0, "No need to update split-brain status"); + ret = 0; + goto out; + } + if (strstr(value->data, "yes") && + (strncmp(oldvalue, value->data, strlen(oldvalue)))) { + old_choice = getChoices(oldvalue); + if (!old_choice) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY, + "Error to get choices"); + ret = -1; + goto out; + } + + ret = add_opt(&full_choice, old_choice); + if (ret) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY, + "Error to add choices"); + ret = -1; + goto out; + } + + new_choice = getChoices(value->data); + if (!new_choice) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY, + "Error to get choices"); + ret = -1; + goto out; + } + + ret = add_opt(&full_choice, new_choice); + if (ret) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY, + "Error to add choices "); + ret = -1; + goto out; + } + ret = gf_asprintf(&status, + "data-split-brain:%s " + "metadata-split-brain:%s Choices:%s", + "no", "yes", full_choice); + + if (-1 == ret) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY, + "Error to prepare status "); + goto out; + } + ret = dict_set_dynstr(dst, key, status); + if (ret) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set full choice"); + } + } + } + +out: + if (old_choice) + GF_FREE(old_choice); + if (new_choice) + GF_FREE(new_choice); + if (full_choice) + GF_FREE(full_choice); + + return ret; +} + +static int +dht_aggregate(dict_t *this, char *key, data_t *value, void *data) +{ + dict_t *dst = NULL; + int32_t ret = -1; + data_t *dict_data = NULL; + + dst = data; + + /* compare split brain xattr only */ + if (strcmp(key, GF_AFR_SBRAIN_STATUS) == 0) { + ret = dht_aggregate_split_brain_xattr(dst, key, value); + if (!ret) + goto out; + } else if (strcmp(key, QUOTA_SIZE_KEY) == 0) { + ret = dht_aggregate_quota_xattr(dst, key, value); + if (ret) { + gf_msg("dht", GF_LOG_WARNING, 0, + DHT_MSG_AGGREGATE_QUOTA_XATTR_FAILED, + "Failed to aggregate quota xattr"); + } + goto out; + } else if (fnmatch(GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) { + ret = gf_get_min_stime(THIS, dst, key, value); + goto out; + } else { + /* compare user xattrs only */ + if (!strncmp(key, "user.", SLEN("user."))) { + ret = dict_lookup(dst, key, &dict_data); + if (!ret && dict_data && value) { + ret = is_data_equal(dict_data, value); + if (!ret) + gf_msg_debug("dht", 0, "xattr mismatch for %s", key); + } + } + } + + ret = dict_set(dst, key, value); + if (ret) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value: key = %s", key); + } + +out: + return ret; +} + +static void +dht_aggregate_xattr(dict_t *dst, dict_t *src) +{ + if ((dst == NULL) || (src == NULL)) { + goto out; + } + + dict_foreach(src, dht_aggregate, dst); +out: + return; +} + +/* Code to save hashed subvol on inode ctx as a mds subvol + */ +int +dht_inode_ctx_mdsvol_set(inode_t *inode, xlator_t *this, xlator_t *mds_subvol) +{ + dht_inode_ctx_t *ctx = NULL; + int ret = -1; + uint64_t ctx_int = 0; + gf_boolean_t ctx_free = _gf_false; + + LOCK(&inode->lock); + { + ret = __inode_ctx_get(inode, this, &ctx_int); + if (ctx_int) { + ctx = (dht_inode_ctx_t *)(uintptr_t)ctx_int; + ctx->mds_subvol = mds_subvol; + } else { + ctx = GF_CALLOC(1, sizeof(*ctx), gf_dht_mt_inode_ctx_t); + if (!ctx) + goto unlock; + ctx->mds_subvol = mds_subvol; + ctx_free = _gf_true; + ctx_int = (long)ctx; + ret = __inode_ctx_set(inode, this, &ctx_int); + } + } +unlock: + UNLOCK(&inode->lock); + if (ret && ctx_free) + GF_FREE(ctx); + return ret; +} + +/*Code to get mds subvol from inode ctx */ + +int +dht_inode_ctx_mdsvol_get(inode_t *inode, xlator_t *this, xlator_t **mdsvol) +{ + dht_inode_ctx_t *ctx = NULL; + int ret = -1; + + if (!mdsvol) + return ret; + + if (__is_root_gfid(inode->gfid)) { + (*mdsvol) = FIRST_CHILD(this); + return 0; + } + + ret = dht_inode_ctx_get(inode, this, &ctx); + + if (!ret && ctx) { + if (ctx->mds_subvol) { + *mdsvol = ctx->mds_subvol; + ret = 0; + } else { + ret = -1; + } + } + + return ret; +} /* TODO: - use volumename in xattr instead of "dht" @@ -39,3697 +450,10942 @@ - complete linkfile selfheal */ +static int +dht_lookup_selfheal_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; + int ret = -1; -int -dht_lookup_selfheal_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int op_ret, int op_errno) + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + + local = frame->local; + conf = this->private; + ret = op_ret; + + FRAME_SU_UNDO(frame, dht_local_t); + + if (ret == 0) { + layout = local->selfheal.layout; + ret = dht_layout_set(this, local->inode, layout); + } + + dht_inode_ctx_time_update(local->inode, this, &local->stbuf, 1); + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, &local->postparent, + 1); + } + + DHT_STRIP_PHASE1_FLAGS(&local->stbuf); + dht_set_fixed_dir_stat(&local->postparent); + /* Delete mds xattr at the time of STACK UNWIND */ + GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr); + + DHT_STACK_UNWIND(lookup, frame, ret, local->op_errno, local->inode, + &local->stbuf, local->xattr, &local->postparent); + +out: + return ret; +} + +static int +dht_discover_complete(xlator_t *this, call_frame_t *discover_frame) { - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; - int ret = 0; + dht_local_t *local = NULL; + dht_local_t *heal_local = NULL; + call_frame_t *main_frame = NULL; + call_frame_t *heal_frame = NULL; + int op_errno = 0; + int ret = -1; + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; + uint32_t vol_commit_hash = 0; + xlator_t *source = NULL; + int heal_path = 0; + int error_while_marking_mds = 0; + int i = 0; + loc_t loc = {0}; + int8_t is_read_only = 0, layout_anomalies = 0; + char gfid_local[GF_UUID_BUF_SIZE] = {0}; + + local = discover_frame->local; + layout = local->layout; + conf = this->private; + gf_uuid_unparse(local->gfid, gfid_local); + + LOCK(&discover_frame->lock); + { + main_frame = local->main_frame; + local->main_frame = NULL; + } + UNLOCK(&discover_frame->lock); + + if (!main_frame) + return 0; - local = frame->local; - ret = op_ret; + /* Code to update all extended attributed from + subvol to local->xattr on that internal xattr has found + */ + if (conf->subvolume_cnt == 1) + local->need_xattr_heal = 0; + if (local->need_xattr_heal && (local->mds_xattr)) { + dht_dir_set_heal_xattr(this, local, local->xattr, local->mds_xattr, + NULL, NULL); + dict_unref(local->mds_xattr); + local->mds_xattr = NULL; + } + + ret = dict_get_int8(local->xattr_req, QUOTA_READ_ONLY_KEY, &is_read_only); + if (ret < 0) + gf_msg_debug(this->name, 0, "key = %s not present in dict", + QUOTA_READ_ONLY_KEY); + + if (local->file_count && local->dir_count) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_FILE_TYPE_MISMATCH, + "path %s exists as a file on one subvolume " + "and directory on another. " + "Please fix it manually", + local->loc.path); + op_errno = EIO; + goto out; + } + + if (local->cached_subvol) { + ret = dht_layout_preset(this, local->cached_subvol, local->inode); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_SET_FAILED, + "failed to set layout for subvolume %s", + local->cached_subvol ? local->cached_subvol->name : "<nil>"); + op_errno = EINVAL; + goto out; + } + } else { + ret = dht_layout_normalize(this, &local->loc, layout); + if ((ret < 0) || ((ret > 0) && (local->op_ret != 0))) { + /* either the layout is incorrect or the directory is + * not found even in one subvolume. + */ + gf_msg_debug(this->name, 0, + "normalizing failed on %s " + "(overlaps/holes present: %s, " + "ENOENT errors: %d)", + local->loc.path, (ret < 0) ? "yes" : "no", + (ret > 0) ? ret : 0); + layout_anomalies = 1; + } else if (local->inode) { + dht_layout_set(this, local->inode, layout); + } + } - if (ret == 0) { - layout = local->selfheal.layout; - ret = inode_ctx_put (local->inode, this, - (uint64_t)(long)layout); + if (!conf->vch_forced) { + ret = dict_get_uint32(local->xattr, conf->commithash_xattr_name, + &vol_commit_hash); + if (ret == 0) { + conf->vol_commit_hash = vol_commit_hash; + } + } + + if (IA_ISDIR(local->stbuf.ia_type) && !is_read_only) { + for (i = 0; i < layout->cnt; i++) { + if (!source && !layout->list[i].err) + source = layout->list[i].xlator; + if (layout->list[i].err == ENOENT || + layout->list[i].err == ESTALE) { + heal_path = 1; + } + + if (source && heal_path) + break; + } + } + + if (IA_ISDIR(local->stbuf.ia_type)) { + /* Call function to save hashed subvol on inode ctx if + internal mds xattr is not present and all subvols are up + */ + if (!local->op_ret && !__is_root_gfid(local->stbuf.ia_gfid)) + (void)dht_common_mark_mdsxattr(discover_frame, + &error_while_marking_mds, 1); + + if (local->need_xattr_heal && !heal_path) { + local->need_xattr_heal = 0; + ret = dht_dir_xattr_heal(this, local, &op_errno); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_DIR_XATTR_HEAL_FAILED, + "xattr heal failed for " + "directory gfid is %s ", + gfid_local); + } + } + } - if (ret == 0) - local->selfheal.layout = NULL; - - if (local->st_ino) { - local->stbuf.st_ino = local->st_ino; - } else { - gf_log (this->name, GF_LOG_WARNING, - "could not find hashed subvolume for %s", - local->loc.path); - } - } + if (source && (heal_path || layout_anomalies || error_while_marking_mds)) { + gf_uuid_copy(loc.gfid, local->gfid); + if (gf_uuid_is_null(loc.gfid)) { + goto done; + } - DHT_STACK_UNWIND (frame, ret, local->op_errno, local->inode, - &local->stbuf, local->xattr); + if (local->inode) + loc.inode = inode_ref(local->inode); + else + goto done; + + heal_frame = create_frame(this, this->ctx->pool); + if (heal_frame) { + heal_local = dht_local_init(heal_frame, &loc, NULL, 0); + if (!heal_local) + goto cleanup; + + gf_uuid_copy(heal_local->gfid, local->gfid); + heal_frame->cookie = source; + heal_local->xattr = dict_ref(local->xattr); + heal_local->stbuf = local->stbuf; + heal_local->postparent = local->postparent; + heal_local->inode = inode_ref(loc.inode); + heal_local->main_frame = main_frame; + FRAME_SU_DO(heal_frame, dht_local_t); + ret = synctask_new(this->ctx->env, dht_heal_full_path, + dht_heal_full_path_done, heal_frame, heal_frame); + if (!ret) { + loc_wipe(&loc); + return 0; + } + /* + * Failed to spawn the synctask. Returning + * with out doing heal. + */ + cleanup: + loc_wipe(&loc); + DHT_STACK_DESTROY(heal_frame); + } + } +done: + dht_set_fixed_dir_stat(&local->postparent); + /* Delete mds xattr at the time of STACK UNWIND */ + if (local->xattr) + GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr); + + DHT_STACK_UNWIND(lookup, main_frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); + return 0; - return 0; +out: + DHT_STACK_UNWIND(lookup, main_frame, -1, op_errno, NULL, NULL, NULL, NULL); + + return ret; } +static int +dht_common_mark_mdsxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *prev = cookie; + int ret = -1; + dht_conf_t *conf = 0; + dht_layout_t *layout = NULL; + int32_t mds_heal_fresh_lookup = 0; + + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + + local = frame->local; + conf = this->private; + layout = local->selfheal.layout; + mds_heal_fresh_lookup = local->mds_heal_fresh_lookup; + + if (op_ret) { + gf_msg_debug(this->name, op_ret, + "Failed to set %s on the MDS %s for path %s. ", + conf->mds_xattr_key, prev->name, local->loc.path); + } else { + /* Save mds subvol on inode ctx */ + ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED, + "Failed to set mds subvol on inode ctx" + " %s for %s ", + prev->name, local->loc.path); + } + } + if (!local->mds_heal_fresh_lookup && layout) { + dht_selfheal_dir_setattr(frame, &local->loc, &local->stbuf, 0xffffffff, + layout); + } +out: + if (mds_heal_fresh_lookup) + DHT_STACK_DESTROY(frame); + return 0; +} + +static xlator_t * +dht_inode_get_hashed_subvol(inode_t *inode, xlator_t *this, loc_t *loc) +{ + char *path = NULL; + loc_t populate_loc = { + 0, + }; + char *name = NULL; + xlator_t *hash_subvol = NULL; + + if (!inode) + return hash_subvol; + + if (loc && loc->parent && loc->path) { + if (!loc->name) { + name = strrchr(loc->path, '/'); + if (name) { + loc->name = name + 1; + } else { + goto out; + } + } + hash_subvol = dht_subvol_get_hashed(this, loc); + goto out; + } + + if (!gf_uuid_is_null(inode->gfid)) { + populate_loc.inode = inode_ref(inode); + populate_loc.parent = inode_parent(populate_loc.inode, NULL, NULL); + inode_path(populate_loc.inode, NULL, &path); + + if (!path) + goto out; + + populate_loc.path = path; + if (!populate_loc.name && populate_loc.path) { + name = strrchr(populate_loc.path, '/'); + if (name) { + populate_loc.name = name + 1; + + } else { + goto out; + } + } + hash_subvol = dht_subvol_get_hashed(this, &populate_loc); + } +out: + if (populate_loc.inode) + loc_wipe(&populate_loc); + return hash_subvol; +} +/* Common function call by revalidate/selfheal code path to populate + internal xattr if it is not present, mark_during_fresh_lookup value + determines either function is call by revalidate_cbk(discover_complete) + or call by selfheal code path while fresh lookup. + Here we do wind a call serially in case of fresh lookup and + for other lookup code path we do wind a call parallel.The reason + to wind a call serially is at the time of fresh lookup directory is not + discovered and at the time of revalidate_lookup directory is + already discovered. So, revalidate codepath can race with setxattr + codepath and can get into spurious heals because of an ongoing setxattr. + This can slow down revalidates, if healing happens in foreground. + However, if healing happens in background, there is no direct performance + penalty. +*/ int -dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct stat *stbuf, dict_t *xattr) -{ - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - dht_layout_t *layout = NULL; - int ret = 0; - int is_dir = 0; - - conf = this->private; - local = frame->local; - prev = cookie; - - layout = local->layout; - - LOCK (&frame->lock); - { - /* TODO: assert equal mode on stbuf->st_mode and - local->stbuf->st_mode - - else mkdir/chmod/chown and fix - */ - ret = dht_layout_merge (this, layout, prev->this, - op_ret, op_errno, xattr); - - if (op_ret == -1) { - local->op_errno = ENOENT; - gf_log (this->name, GF_LOG_WARNING, - "lookup of %s on %s returned error (%s)", - local->loc.path, prev->this->name, - strerror (op_errno)); - - goto unlock; - } - - is_dir = check_is_dir (inode, stbuf, xattr); - if (!is_dir) { - gf_log (this->name, GF_LOG_WARNING, - "lookup of %s on %s returned non dir 0%o", - local->loc.path, prev->this->name, - stbuf->st_mode); - local->need_selfheal = 1; - goto unlock; +dht_common_mark_mdsxattr(call_frame_t *frame, int *errst, + int mark_during_fresh_lookup) +{ + dht_local_t *local = NULL; + xlator_t *this = NULL; + xlator_t *hashed_subvol = NULL; + int ret = 0; + int i = 0; + dict_t *xattrs = NULL; + char gfid_local[GF_UUID_BUF_SIZE] = { + 0, + }; + int32_t zero[1] = {0}; + dht_conf_t *conf = 0; + dht_layout_t *layout = NULL; + dht_local_t *copy_local = NULL; + call_frame_t *xattr_frame = NULL; + gf_boolean_t vol_down = _gf_false; + + GF_VALIDATE_OR_GOTO("dht", frame, out); + this = frame->this; + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + local = frame->local; + conf = this->private; + layout = local->selfheal.layout; + local->mds_heal_fresh_lookup = mark_during_fresh_lookup; + + gf_uuid_unparse(local->gfid, gfid_local); + + /* Code to update hashed subvol consider as a mds subvol + and wind a setxattr call on hashed subvol to update + internal xattr + */ + if (!local->xattr || !dict_get(local->xattr, conf->mds_xattr_key)) { + /* It means no internal MDS xattr has been set yet + */ + /* Check the status of all subvol are up while call + this function call by lookup code path + */ + if (mark_during_fresh_lookup) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!conf->subvolume_status[i]) { + vol_down = _gf_true; + break; } + } + if (vol_down) { + gf_msg_debug(this->name, 0, + "subvol %s is down. Unable to " + " save mds subvol on inode for " + " path %s gfid is %s ", + conf->subvolumes[i]->name, local->loc.path, + gfid_local); + goto out; + } + } + + /* Calculate hashed subvol based on inode and parent node + */ + hashed_subvol = dht_inode_get_hashed_subvol(local->inode, this, + &local->loc); + if (!hashed_subvol) { + gf_msg(this->name, GF_LOG_DEBUG, 0, + DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "Failed to get hashed subvol for path %s" + "gfid is %s ", + local->loc.path, gfid_local); + if (errst) + (*errst) = 1; + ret = -1; + goto out; + } + xattrs = dict_new(); + if (!xattrs) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "dict_new failed"); + ret = -1; + goto out; + } + /* Add internal MDS xattr on disk for hashed subvol + */ + ret = dht_dict_set_array(xattrs, conf->mds_xattr_key, zero, 1); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary" + " value:key = %s for " + "path %s", + conf->mds_xattr_key, local->loc.path); + ret = -1; + goto out; + } + /* Create a new frame to wind a call only while + this function call by revalidate_cbk code path + To wind a call parallel need to create a new frame + */ + if (mark_during_fresh_lookup) { + xattr_frame = create_frame(this, this->ctx->pool); + if (!xattr_frame) { + ret = -1; + goto out; + } + copy_local = dht_local_init(xattr_frame, &(local->loc), NULL, 0); + if (!copy_local) { + ret = -1; + DHT_STACK_DESTROY(xattr_frame); + goto out; + } + copy_local->stbuf = local->stbuf; + copy_local->mds_heal_fresh_lookup = mark_during_fresh_lookup; + if (!copy_local->inode) + copy_local->inode = inode_ref(local->inode); + gf_uuid_copy(copy_local->loc.gfid, local->gfid); + FRAME_SU_DO(xattr_frame, dht_local_t); + STACK_WIND_COOKIE(xattr_frame, dht_common_mark_mdsxattr_cbk, + hashed_subvol, hashed_subvol, + hashed_subvol->fops->setxattr, &local->loc, + xattrs, 0, NULL); + } else { + STACK_WIND_COOKIE(frame, dht_common_mark_mdsxattr_cbk, + (void *)hashed_subvol, hashed_subvol, + hashed_subvol->fops->setxattr, &local->loc, + xattrs, 0, NULL); + } + } else { + gf_msg_debug(this->name, 0, + "internal xattr %s is present on subvol" + "on path %s gfid is %s ", + conf->mds_xattr_key, local->loc.path, gfid_local); + if (!mark_during_fresh_lookup) + dht_selfheal_dir_setattr(frame, &local->loc, &local->stbuf, + 0xffffffff, layout); + } - local->op_ret = 0; - if (local->xattr == NULL) - local->xattr = dict_ref (xattr); - if (local->inode == NULL) - local->inode = inode_ref (inode); +out: + if (xattrs) + dict_unref(xattrs); + return ret; +} - dht_stat_merge (this, &local->stbuf, stbuf, prev->this); +/* Get the value of key from dict in the bytewise and save in array after + convert from network byte order to host byte order +*/ +static int32_t +dht_dict_get_array(dict_t *dict, char *key, int32_t value[], int32_t size, + int *errst) +{ + void *ptr = NULL; + int32_t len = -1; + int32_t vindex = -1; + int32_t err = -1; + int ret = 0; + + if (dict == NULL) { + (*errst) = -1; + return -EINVAL; + } + err = dict_get_ptr_and_len(dict, key, &ptr, &len); + if (err != 0) { + (*errst) = -1; + return err; + } + + if (len != (size * sizeof(int32_t))) { + (*errst) = -1; + return -EINVAL; + } + + for (vindex = 0; vindex < size; vindex++) { + value[vindex] = ntoh32(*((int32_t *)ptr + vindex)); + if (value[vindex] < 0) + ret = -1; + } + + return ret; +} - if (prev->this == local->hashed_subvol) - local->st_ino = local->stbuf.st_ino; +static int +dht_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *stbuf, + dict_t *xattr, struct iatt *postparent) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + dht_layout_t *layout = NULL; + int ret = -1; + int is_dir = 0; + int32_t check_mds = 0; + int is_linkfile = 0; + int attempt_unwind = 0; + dht_conf_t *conf = 0; + char gfid_local[GF_UUID_BUF_SIZE] = {0}; + char gfid_node[GF_UUID_BUF_SIZE] = {0}; + int32_t mds_xattr_val[1] = {0}; + int errst = 0; + + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", this->private, out); + GF_VALIDATE_OR_GOTO("dht", cookie, out); + + local = frame->local; + prev = cookie; + conf = this->private; + + layout = local->layout; + + /* Check if the gfid is different for file from other node */ + if (!op_ret && gf_uuid_compare(local->gfid, stbuf->ia_gfid)) { + gf_uuid_unparse(stbuf->ia_gfid, gfid_node); + gf_uuid_unparse(local->gfid, gfid_local); + + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH, + "%s: gfid different on %s, gfid local = %s" + "gfid other = %s", + local->loc.path, prev->name, gfid_local, gfid_node); + } + + LOCK(&frame->lock); + { + /* TODO: assert equal mode on stbuf->st_mode and + local->stbuf->st_mode + + else mkdir/chmod/chown and fix + */ + + ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, xattr); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED, + "%s: failed to merge layouts for subvol %s", local->loc.path, + prev->name); + + if (op_ret == -1) { + local->op_errno = op_errno; + gf_msg_debug(this->name, op_errno, + "lookup of %s on %s returned error", local->loc.path, + prev->name); + goto unlock; } -unlock: - UNLOCK (&frame->lock); + is_linkfile = check_is_linkfile(inode, stbuf, xattr, + conf->link_xattr_name); + is_dir = check_is_dir(inode, stbuf, xattr); - this_call_cnt = dht_frame_return (frame); + if (is_dir) { + local->dir_count++; + } else { + local->file_count++; + + if (!is_linkfile && !local->cached_subvol) { + /* real file */ + /* Ok, we somehow managed to find a file on + * more than one subvol. ignore this or we + * will end up overwriting information while a + * a thread is potentially unwinding from + * dht_discover_complete + */ + local->cached_subvol = prev; + attempt_unwind = 1; + } else { + goto unlock; + } + } - if (is_last_call (this_call_cnt)) { - if (local->need_selfheal) { - local->need_selfheal = 0; - dht_lookup_everywhere (frame, this, &local->loc); - return 0; - } + local->op_ret = 0; - if (local->op_ret == 0) { - ret = dht_layout_normalize (this, &local->loc, layout); + if (local->xattr == NULL) { + local->xattr = dict_ref(xattr); + } else { + /* Don't aggregate for files. See BZ#1484709 */ + if (is_dir) + dht_aggregate_xattr(local->xattr, xattr); + } - local->layout = NULL; + if (local->inode == NULL) + local->inode = inode_ref(inode); - if (ret != 0) { - layout->gen = conf->gen; + dht_iatt_merge(this, &local->stbuf, stbuf); + dht_iatt_merge(this, &local->postparent, postparent); - gf_log (this->name, GF_LOG_WARNING, - "fixing assignment on %s", - local->loc.path); - goto selfheal; - } - - inode_ctx_put (local->inode, this, - (uint64_t)(long)layout); - - if (local->st_ino) { - local->stbuf.st_ino = local->st_ino; - } else { - gf_log (this->name, GF_LOG_WARNING, - "could not find hashed subvol for %s", - local->loc.path); - } - } + if (!dict_get(xattr, conf->mds_xattr_key)) { + goto unlock; + } else { + gf_msg_debug(this->name, 0, + "internal xattr %s is present on subvol" + "on path %s gfid is %s ", + conf->mds_xattr_key, local->loc.path, gfid_local); + } + check_mds = dht_dict_get_array(xattr, conf->mds_xattr_key, + mds_xattr_val, 1, &errst); + /* save mds subvol on inode ctx */ + ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED, + "Failed to set hashed subvol for %s vol is %s", + local->loc.path, prev->name); + } - DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->inode, &local->stbuf, local->xattr); + if ((check_mds < 0) && !errst) { + local->mds_xattr = dict_ref(xattr); + gf_msg_debug(this->name, 0, + "Value of %s is not zero on mds subvol" + "so xattr needs to be healed on non mds" + " path is %s and vol name is %s " + " gfid is %s", + conf->mds_xattr_key, local->loc.path, prev->name, + gfid_local); + local->need_xattr_heal = 1; + local->mds_subvol = prev; } + } +unlock: + UNLOCK(&frame->lock); +out: + /* Make sure, the thread executing dht_discover_complete is the one + * which calls STACK_DESTROY (frame). In the case of "attempt_unwind", + * this makes sure that the thread don't call dht_frame_return, till + * call to dht_discover_complete is done. + */ + if (attempt_unwind) { + dht_discover_complete(this, frame); + } - return 0; + this_call_cnt = dht_frame_return(frame); -selfheal: - ret = dht_selfheal_directory (frame, dht_lookup_selfheal_cbk, - &local->loc, layout); + if (is_last_call(this_call_cnt) && !attempt_unwind) { + dht_discover_complete(this, frame); + } + + if (is_last_call(this_call_cnt)) + DHT_STACK_DESTROY(frame); - return 0; + return 0; } -int -dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct stat *stbuf, dict_t *xattr) -{ - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - dht_layout_t *layout = NULL; - int ret = -1; - int is_dir = 0; - int is_linkfile = 0; +static int +dht_set_file_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req) +{ + int ret = -EINVAL; + dht_conf_t *conf = NULL; + + conf = this->private; + if (!conf) { + goto err; + } + + if (!xattr_req) { + goto err; + } + + /* Used to check whether this is a linkto file. + */ + ret = dict_set_uint32(xattr_req, conf->link_xattr_name, 256); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value:key = %s for " + "path %s", + conf->link_xattr_name, loc->path); + goto err; + } + + /* This is used to make sure we don't unlink linkto files + * which are the target of an ongoing file migration. + */ + ret = dict_set_uint32(xattr_req, GLUSTERFS_OPEN_FD_COUNT, 4); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value:key = %s for " + "path %s", + GLUSTERFS_OPEN_FD_COUNT, loc->path); + goto err; + } + + ret = 0; +err: + return ret; +} - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - - if ((op_errno != ENOTCONN) - && (op_errno != ENOENT) - && (op_errno != ESTALE)) { - gf_log (this->name, GF_LOG_WARNING, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - } - - if (op_errno == ESTALE) { - /* propogate the ESTALE to parent. - * setting local->layout_mismatch would send - * ESTALE to parent. */ - local->layout_mismatch = 1; - } +/* This is a gfid based nameless lookup. Without a name, the hashed subvol + * cannot be calculated so a lookup is sent to all subvols. + */ +static int +dht_do_discover(call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + int ret; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int call_cnt = 0; + int op_errno = EINVAL; + int i = 0; + call_frame_t *discover_frame = NULL; + + conf = this->private; + local = frame->local; + + /* As we do not know if this is a file or directory, request + * both file and directory xattrs + */ + ret = dht_set_file_xattr_req(this, loc, local->xattr_req); + if (ret) { + goto err; + } + + ret = dht_set_dir_xattr_req(this, loc, local->xattr_req); + if (ret) { + goto err; + } + + if (loc_is_root(loc)) { + /* Request the DHT commit hash xattr (trusted.glusterfs.dht.commithash) + * set on the brick root. + */ + ret = dict_set_uint32(local->xattr_req, conf->commithash_xattr_name, + sizeof(uint32_t)); + } + + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + local->layout = dht_layout_new(this, conf->subvolume_cnt); + + if (!local->layout) { + op_errno = ENOMEM; + goto err; + } + + gf_uuid_copy(local->gfid, loc->gfid); + + discover_frame = copy_frame(frame); + if (!discover_frame) { + op_errno = ENOMEM; + goto err; + } + + discover_frame->local = local; + frame->local = NULL; + local->main_frame = frame; + + for (i = 0; i < call_cnt; i++) { + STACK_WIND_COOKIE(discover_frame, dht_discover_cbk, conf->subvolumes[i], + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, &local->loc, + local->xattr_req); + } + + return 0; - goto unlock; - } - - if (S_IFMT & (stbuf->st_mode ^ local->inode->st_mode)) { - gf_log (this->name, GF_LOG_WARNING, - "mismatching filetypes 0%o v/s 0%o for %s", - (stbuf->st_mode & S_IFMT), - (local->inode->st_mode & S_IFMT), - local->loc.path); - - local->op_ret = -1; - local->op_errno = EINVAL; - - goto unlock; - } - - layout = dht_layout_get (this, inode); - - is_dir = check_is_dir (inode, stbuf, xattr); - is_linkfile = check_is_linkfile (inode, stbuf, xattr); - - if (is_linkfile) { - gf_log (this->name, GF_LOG_WARNING, - "linkfile found in revalidate for %s", - local->loc.path); - local->layout_mismatch = 1; - - goto unlock; - } - - if (is_dir) { - ret = dht_layout_dir_mismatch (this, layout, - prev->this, &local->loc, - xattr); - if (ret != 0) { - gf_log (this->name, GF_LOG_WARNING, - "mismatching layouts for %s", - local->loc.path); - - local->layout_mismatch = 1; - - goto unlock; - } - } - - dht_stat_merge (this, &local->stbuf, stbuf, prev->this); - - local->op_ret = 0; - local->stbuf.st_ino = local->st_ino; - - if (!local->xattr) - local->xattr = dict_ref (xattr); - } -unlock: - UNLOCK (&frame->lock); - - this_call_cnt = dht_frame_return (frame); - - if (is_last_call (this_call_cnt)) { - if (!S_ISDIR (local->stbuf.st_mode) - && (local->hashed_subvol != local->cached_subvol) - && (local->stbuf.st_nlink == 1)) - local->stbuf.st_mode |= S_ISVTX; - - if (local->layout_mismatch) { - local->op_ret = -1; - local->op_errno = ESTALE; - } - - DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->inode, &local->stbuf, local->xattr); - } +err: + DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); - return 0; + return 0; +} + +/* Code to call syntask to heal custom xattr from hashed subvol + to non hashed subvol +*/ +int +dht_dir_xattr_heal(xlator_t *this, dht_local_t *local, int *op_errno) +{ + dht_local_t *copy_local = NULL; + call_frame_t *copy = NULL; + int ret = -1; + char gfid_local[GF_UUID_BUF_SIZE] = {0}; + + if (gf_uuid_is_null(local->gfid)) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DIR_XATTR_HEAL_FAILED, + "No gfid exists for path %s " + "so healing xattr is not possible", + local->loc.path); + *op_errno = EIO; + goto out; + } + + gf_uuid_unparse(local->gfid, gfid_local); + copy = create_frame(this, this->ctx->pool); + if (copy) { + copy_local = dht_local_init(copy, &(local->loc), NULL, 0); + if (!copy_local) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, + DHT_MSG_DIR_XATTR_HEAL_FAILED, + "Memory allocation failed " + "for path %s gfid %s ", + local->loc.path, gfid_local); + *op_errno = ENOMEM; + DHT_STACK_DESTROY(copy); + } else { + copy_local->stbuf = local->stbuf; + gf_uuid_copy(copy_local->loc.gfid, local->gfid); + copy_local->mds_subvol = local->mds_subvol; + FRAME_SU_DO(copy, dht_local_t); + ret = synctask_new(this->ctx->env, dht_dir_heal_xattrs, + dht_dir_heal_xattrs_done, copy, copy); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, + DHT_MSG_DIR_XATTR_HEAL_FAILED, + "Synctask creation failed to heal xattr " + "for path %s gfid %s ", + local->loc.path, gfid_local); + *op_errno = ENOMEM; + DHT_STACK_DESTROY(copy); + } + } + } +out: + return ret; } +static int +dht_needs_selfheal(call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int needs_selfheal = 0; + int ret = 0; + + local = frame->local; + layout = local->layout; + + if (local->need_attrheal || local->need_xattr_heal || + local->need_selfheal) { + needs_selfheal = 1; + } + + ret = dht_layout_normalize(this, &local->loc, layout); + + if (ret != 0) { + gf_msg_debug(this->name, 0, "fixing assignment on %s", local->loc.path); + needs_selfheal = 1; + } + return needs_selfheal; +} + +static int +is_permission_different(ia_prot_t *prot1, ia_prot_t *prot2) +{ + if ((prot1->owner.read != prot2->owner.read) || + (prot1->owner.write != prot2->owner.write) || + (prot1->owner.exec != prot2->owner.exec) || + (prot1->group.read != prot2->group.read) || + (prot1->group.write != prot2->group.write) || + (prot1->group.exec != prot2->group.exec) || + (prot1->other.read != prot2->other.read) || + (prot1->other.write != prot2->other.write) || + (prot1->other.exec != prot2->other.exec) || + (prot1->suid != prot2->suid) || (prot1->sgid != prot2->sgid) || + (prot1->sticky != prot2->sticky)) { + return 1; + } else { + return 0; + } +} int -dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct stat *stbuf) +dht_lookup_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, + dict_t *xattr, struct iatt *postparent) { - dht_local_t *local = NULL; - xlator_t *cached_subvol = NULL; - int ret = -1; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + dht_layout_t *layout = NULL; + int ret = -1; + int is_dir = 0; + int32_t check_mds = 0; + int errst = 0; + char gfid_local[GF_UUID_BUF_SIZE] = {0}; + char gfid_node[GF_UUID_BUF_SIZE] = {0}; + int32_t mds_xattr_val[1] = {0}; + + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", this->private, out); + GF_VALIDATE_OR_GOTO("dht", cookie, out); + + local = frame->local; + prev = cookie; + conf = this->private; + + layout = local->layout; + gf_msg_debug(this->name, op_errno, + "%s: lookup on %s returned with op_ret = %d, op_errno = %d", + local->loc.path, prev->name, op_ret, op_errno); + + /* The first successful lookup*/ + if (!op_ret && gf_uuid_is_null(local->gfid)) { + memcpy(local->gfid, stbuf->ia_gfid, 16); + } + if (!gf_uuid_is_null(local->gfid)) { + gf_uuid_unparse(local->gfid, gfid_local); + } + + /* Check if the gfid is different for file from other node */ + if (!op_ret && gf_uuid_compare(local->gfid, stbuf->ia_gfid)) { + gf_uuid_unparse(stbuf->ia_gfid, gfid_node); + + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH, + "%s: gfid different on %s." + " gfid local = %s, gfid subvol = %s", + local->loc.path, prev->name, gfid_local, gfid_node); + } + + LOCK(&frame->lock); + { + /* TODO: assert equal mode on stbuf->st_mode and + local->stbuf->st_mode + else mkdir/chmod/chown and fix + */ + ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, xattr); - local = frame->local; - cached_subvol = local->cached_subvol; + if (op_ret == -1) { + local->op_errno = op_errno; - ret = dht_layout_inode_set (this, local->cached_subvol, inode); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "failed to set layout for subvolume %s", - cached_subvol ? cached_subvol->name : "<nil>"); - local->op_ret = -1; - local->op_errno = EINVAL; - goto unwind; + /* The GFID is missing on this subvol. Force a heal. */ + if (op_errno == ENODATA) { + local->need_lookup_everywhere = 1; + } + goto unlock; } - local->op_ret = 0; - if (local->stbuf.st_nlink == 1) - local->stbuf.st_mode |= S_ISVTX; + is_dir = check_is_dir(inode, stbuf, xattr); + if (!is_dir) { + gf_msg_debug(this->name, 0, + "%s: lookup on %s returned non dir 0%o" + "calling lookup_everywhere", + local->loc.path, prev->name, stbuf->ia_type); -unwind: - DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->inode, &local->stbuf, local->xattr); - return 0; -} + local->need_lookup_everywhere = 1; + goto unlock; + } + local->op_ret = 0; + if (local->xattr == NULL) { + local->xattr = dict_ref(xattr); + } else { + dht_aggregate_xattr(local->xattr, xattr); + } -int -dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct stat *buf, dict_t *xattr) -{ - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - int is_linkfile = 0; - int is_dir = 0; - xlator_t *subvol = NULL; - loc_t *loc = NULL; - xlator_t *link_subvol = NULL; - xlator_t *hashed_subvol = NULL; - xlator_t *cached_subvol = NULL; - int ret = -1; - - conf = this->private; - - local = frame->local; - loc = &local->loc; - - prev = cookie; - subvol = prev->this; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - if (op_errno != ENOENT) - local->op_errno = op_errno; - goto unlock; - } - - is_linkfile = check_is_linkfile (inode, buf, xattr); - is_dir = check_is_dir (inode, buf, xattr); - - if (is_linkfile) { - link_subvol = dht_linkfile_subvol (this, inode, buf, - xattr); - gf_log (this->name, GF_LOG_WARNING, - "found on %s linkfile %s (-> %s)", - subvol->name, loc->path, - link_subvol ? link_subvol->name : "''"); - goto unlock; - } - - if (is_dir) { - local->dir_count++; - - gf_log (this->name, GF_LOG_WARNING, - "found on %s directory %s", - subvol->name, loc->path); - } else { - local->file_count++; - - if (!local->cached_subvol) { - /* found one file */ - dht_stat_merge (this, &local->stbuf, buf, - subvol); - local->xattr = dict_ref (xattr); - local->cached_subvol = subvol; - gf_log (this->name, GF_LOG_DEBUG, - "found on %s file %s", - subvol->name, loc->path); - } else { - gf_log (this->name, GF_LOG_WARNING, - "multiple subvolumes (%s and %s) have " - "file %s", local->cached_subvol->name, - subvol->name, local->loc.path); - } + if (__is_root_gfid(stbuf->ia_gfid)) { + ret = dht_dir_has_layout(xattr, conf->xattr_name); + if (ret >= 0) { + if (is_greater_time(local->prebuf.ia_ctime, + local->prebuf.ia_ctime_nsec, + stbuf->ia_ctime, stbuf->ia_ctime_nsec)) { + /* Choose source */ + local->prebuf.ia_gid = stbuf->ia_gid; + local->prebuf.ia_uid = stbuf->ia_uid; + + local->prebuf.ia_ctime = stbuf->ia_ctime; + local->prebuf.ia_ctime_nsec = stbuf->ia_ctime_nsec; + local->prebuf.ia_prot = stbuf->ia_prot; } - } + } + } + + if (local->stbuf.ia_type != IA_INVAL) { + /* This is not the first subvol to respond + * Compare values to see if attrs need to be healed + */ + if ((local->stbuf.ia_gid != stbuf->ia_gid) || + (local->stbuf.ia_uid != stbuf->ia_uid) || + (is_permission_different(&local->stbuf.ia_prot, + &stbuf->ia_prot))) { + local->need_attrheal = 1; + } + } + + if (local->inode == NULL) + local->inode = inode_ref(inode); + + dht_iatt_merge(this, &local->stbuf, stbuf); + dht_iatt_merge(this, &local->postparent, postparent); + + if (!dict_get(xattr, conf->mds_xattr_key)) { + gf_msg_debug(this->name, 0, + "%s: mds xattr %s is not present " + "on %s(gfid = %s)", + local->loc.path, conf->mds_xattr_key, prev->name, + gfid_local); + goto unlock; + } + + /* Save the mds subvol info and stbuf. This is the value that will + * be used for healing + */ + local->mds_subvol = prev; + local->mds_stbuf = *stbuf; + + /* Save mds subvol on inode ctx */ + + ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED, + "%s: Failed to set mds (%s)", local->loc.path, prev->name); + } + check_mds = dht_dict_get_array(xattr, conf->mds_xattr_key, + mds_xattr_val, 1, &errst); + if ((check_mds < 0) && !errst) { + /* Check if xattrs need to be healed on the directories */ + local->mds_xattr = dict_ref(xattr); + gf_msg_debug(this->name, 0, + "%s: %s is not zero on %s. Xattrs need to be healed." + "(gfid = %s)", + local->loc.path, conf->mds_xattr_key, prev->name, + gfid_local); + local->need_xattr_heal = 1; + } + } + unlock: - UNLOCK (&frame->lock); - - if (is_linkfile) { - gf_log (this->name, GF_LOG_WARNING, - "deleting stale linkfile %s on %s", - loc->path, subvol->name); - dht_linkfile_unlink (frame, this, subvol, loc); - } - - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - hashed_subvol = local->hashed_subvol; - cached_subvol = local->cached_subvol; - - if (local->file_count && local->dir_count) { - gf_log (this->name, GF_LOG_ERROR, - "path %s is both file and directory at the " - "backend. Please fix it manually", - loc->path); - DHT_STACK_UNWIND (frame, -1, EIO, NULL, NULL, NULL); - return 0; - } + UNLOCK(&frame->lock); - if (local->dir_count) { - dht_lookup_directory (frame, this, &local->loc); - return 0; - } + this_call_cnt = dht_frame_return(frame); - if (!cached_subvol) { - DHT_STACK_UNWIND (frame, -1, ENOENT, NULL, NULL, NULL); - return 0; - } - - if (!hashed_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "cannot create linkfile file for %s on %s: " - "hashed subvolume cannot be found.", - loc->path, cached_subvol->name); - - local->op_ret = 0; - local->op_errno = 0; - - ret = dht_layout_inode_set (frame->this, cached_subvol, - local->inode); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "failed to set layout for subvol %s", - cached_subvol ? cached_subvol->name : - "<nil>"); - local->op_ret = -1; - local->op_errno = EINVAL; - } + if (is_last_call(this_call_cnt)) { + /* If the mds subvol is not set correctly*/ + if (!__is_root_gfid(local->gfid) && + (!dict_get(local->xattr, conf->mds_xattr_key))) { + local->need_selfheal = 1; + } - DHT_STACK_UNWIND (frame, local->op_ret, - local->op_errno, local->inode, - &local->stbuf, local->xattr); - return 0; - } + /* No need to call xattr heal code if volume count is 1 + */ + if (conf->subvolume_cnt == 1) { + local->need_xattr_heal = 0; + } - gf_log (this->name, GF_LOG_WARNING, - "linking file %s existing on %s to %s (hash)", - loc->path, cached_subvol->name, - hashed_subvol->name); - - dht_linkfile_create (frame, - dht_lookup_linkfile_create_cbk, - cached_subvol, hashed_subvol, loc); - } + if (local->need_selfheal || local->need_lookup_everywhere) { + /* Set the gfid-req so posix will set the GFID*/ + if (!gf_uuid_is_null(local->gfid)) { + /* Ok, this should _never_ happen */ + ret = dict_set_static_bin(local->xattr_req, "gfid-req", + local->gfid, 16); + } else { + if (!gf_uuid_is_null(local->gfid_req)) + ret = dict_set_static_bin(local->xattr_req, "gfid-req", + local->gfid_req, 16); + } + } - return 0; -} + if (local->need_lookup_everywhere) { + local->need_lookup_everywhere = 0; + dht_lookup_everywhere(frame, this, &local->loc); + return 0; + } + if (local->op_ret == 0) { + if (dht_needs_selfheal(frame, this)) { + goto selfheal; + } -int -dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc) -{ - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - int i = 0; - int call_cnt = 0; + dht_layout_set(this, local->inode, layout); + if (local->inode) { + dht_inode_ctx_time_update(local->inode, this, &local->stbuf, 1); + } - conf = this->private; - local = frame->local; + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, + &local->postparent, 1); + } + } - call_cnt = conf->subvolume_cnt; - local->call_cnt = call_cnt; + DHT_STRIP_PHASE1_FLAGS(&local->stbuf); + dht_set_fixed_dir_stat(&local->postparent); + /* Delete mds xattr at the time of STACK UNWIND */ + if (local->xattr) + GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr); - if (!local->inode) - local->inode = inode_ref (loc->inode); + DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); + } - for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_lookup_everywhere_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->lookup, - loc, local->xattr_req); - } + return 0; - return 0; +selfheal: + FRAME_SU_DO(frame, dht_local_t); + ret = dht_selfheal_directory(frame, dht_lookup_selfheal_cbk, &local->loc, + layout); +out: + return ret; } +static int +dht_lookup_directory(call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + int call_cnt = 0; + int i = 0; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int ret = 0; + + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", this, unwind); + GF_VALIDATE_OR_GOTO("dht", frame->local, unwind); + GF_VALIDATE_OR_GOTO("dht", this->private, unwind); + GF_VALIDATE_OR_GOTO("dht", loc, unwind); + + conf = this->private; + local = frame->local; + + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + local->layout = dht_layout_new(this, conf->subvolume_cnt); + if (!local->layout) { + goto unwind; + } + + if (local->xattr != NULL) { + dict_unref(local->xattr); + local->xattr = NULL; + } + + if (!gf_uuid_is_null(local->gfid)) { + /* use this gfid in order to heal any missing ones */ + ret = dict_set_gfuuid(local->xattr_req, "gfid-req", local->gfid, true); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "%s: Failed to set dictionary value:" + " key = gfid-req", + local->loc.path); + } + + for (i = 0; i < call_cnt; i++) { + STACK_WIND_COOKIE( + frame, dht_lookup_dir_cbk, conf->subvolumes[i], conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, &local->loc, local->xattr_req); + } + return 0; +unwind: + DHT_STACK_UNWIND(lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL); +out: + return 0; +} int -dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int op_ret, int op_errno, - inode_t *inode, struct stat *stbuf, dict_t *xattr) +dht_revalidate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, + dict_t *xattr, struct iatt *postparent) { - call_frame_t *prev = NULL; - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; - xlator_t *subvol = NULL; - loc_t *loc = NULL; + dht_local_t *local = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + int is_dir = 0; + int is_linkfile = 0; + int follow_link = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + uint32_t vol_commit_hash = 0; + xlator_t *subvol = NULL; + int32_t check_mds = 0; + int errst = 0, i = 0; + int32_t mds_xattr_val[1] = {0}; + + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO("dht", this, err); + GF_VALIDATE_OR_GOTO("dht", frame->local, err); + GF_VALIDATE_OR_GOTO("dht", cookie, err); + GF_VALIDATE_OR_GOTO("dht", this->private, err); + + local = frame->local; + prev = cookie; + conf = this->private; + + if (!conf->vch_forced) { + /* Update the commithash value if available + */ + ret = dict_get_uint32(xattr, conf->commithash_xattr_name, + &vol_commit_hash); + if (ret == 0) { + conf->vol_commit_hash = vol_commit_hash; + } + } + + gf_uuid_unparse(local->loc.gfid, gfid); - prev = cookie; - subvol = prev->this; + gf_msg_debug(this->name, op_errno, + "%s: revalidate lookup on %s returned op_ret %d", + local->loc.path, prev->name, op_ret); - local = frame->local; - loc = &local->loc; + LOCK(&frame->lock); + { + if (gf_uuid_is_null(local->gfid)) { + memcpy(local->gfid, local->loc.gfid, 16); + } if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "lookup of %s on %s (following linkfile) failed (%s)", - local->loc.path, subvol->name, strerror (op_errno)); - goto err; - } + local->op_errno = op_errno; + + if ((op_errno != ENOTCONN) && (op_errno != ENOENT) && + (op_errno != ESTALE)) { + gf_msg(this->name, GF_LOG_INFO, op_errno, + DHT_MSG_REVALIDATE_CBK_INFO, + "Revalidate: subvolume %s for %s " + "(gfid = %s) returned -1", + prev->name, local->loc.path, gfid); + } + if (op_errno == ESTALE) { + /* propagate the ESTALE to parent. + * setting local->return_estale would send + * ESTALE to parent. */ + local->return_estale = 1; + } + + /* if it is ENOENT, we may have to do a + * 'lookup_everywhere()' to make sure + * the file is not migrated */ + if (op_errno == ENOENT) { + if (IA_ISREG(local->loc.inode->ia_type)) { + gf_msg_debug(this->name, 0, + "found ENOENT for %s. " + "Setting " + "need_lookup_everywhere" + " flag to 1", + local->loc.path); + + local->need_lookup_everywhere = 1; + } else if (IA_ISDIR(local->loc.inode->ia_type)) { + layout = local->layout; + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].xlator == prev) { + layout->list[i].err = op_errno; + break; + } + } - if (check_is_dir (inode, stbuf, xattr)) { - gf_log (this->name, GF_LOG_WARNING, - "lookup of %s on %s (following linkfile) reached dir", - local->loc.path, subvol->name); - goto err; + local->need_selfheal = 1; + } + } + + /* The GFID is missing on this subvol. Lookup everywhere to force a + * gfid heal + */ + if ((op_errno == ENODATA) && + (IA_ISDIR(local->loc.inode->ia_type))) { + local->need_lookup_everywhere = 1; + } + + goto unlock; } - if (check_is_linkfile (inode, stbuf, xattr)) { - gf_log (this->name, GF_LOG_WARNING, - "lookup of %s on %s (following linkfile) reached link", - local->loc.path, subvol->name); - goto err; + if ((!IA_ISINVAL(local->inode->ia_type)) && + stbuf->ia_type != local->inode->ia_type) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_FILE_TYPE_MISMATCH, + "mismatching filetypes 0%o v/s 0%o for %s," + " gfid = %s", + (stbuf->ia_type), (local->inode->ia_type), local->loc.path, + gfid); + + local->op_ret = -1; + local->op_errno = EINVAL; + + goto unlock; } - if (stbuf->st_nlink == 1) - stbuf->st_mode |= S_ISVTX; - dht_itransform (this, prev->this, stbuf->st_ino, &stbuf->st_ino); + layout = local->layout; - layout = dht_layout_for_subvol (this, prev->this); - if (!layout) { - gf_log (this->name, GF_LOG_ERROR, - "no pre-set layout for subvolume %s", - prev->this->name); - op_ret = -1; - op_errno = EINVAL; - goto out; - } + is_dir = check_is_dir(inode, stbuf, xattr); + is_linkfile = check_is_linkfile(inode, stbuf, xattr, + conf->link_xattr_name); + if (is_linkfile) { + follow_link = 1; + goto unlock; + } + if (is_dir) { + ret = dht_dir_has_layout(xattr, conf->xattr_name); + if (ret >= 0) { + if (is_greater_time(local->prebuf.ia_ctime, + local->prebuf.ia_ctime_nsec, + stbuf->ia_ctime, stbuf->ia_ctime_nsec)) { + /* Choose source */ + local->prebuf.ia_gid = stbuf->ia_gid; + local->prebuf.ia_uid = stbuf->ia_uid; + + local->prebuf.ia_ctime = stbuf->ia_ctime; + local->prebuf.ia_ctime_nsec = stbuf->ia_ctime_nsec; + + if (__is_root_gfid(stbuf->ia_gfid)) + local->prebuf.ia_prot = stbuf->ia_prot; + } + } + + if (local->stbuf.ia_type != IA_INVAL) { + if ((local->stbuf.ia_gid != stbuf->ia_gid) || + (local->stbuf.ia_uid != stbuf->ia_uid) || + is_permission_different(&local->stbuf.ia_prot, + &stbuf->ia_prot)) { + local->need_attrheal = 1; + } + } + + if (!dict_get(xattr, conf->mds_xattr_key)) { + gf_msg_debug(this->name, 0, + "%s: internal xattr %s is not present" + " on subvol %s(gfid is %s)", + local->loc.path, conf->mds_xattr_key, prev->name, + gfid); + } else { + check_mds = dht_dict_get_array(xattr, conf->mds_xattr_key, + mds_xattr_val, 1, &errst); + local->mds_subvol = prev; + local->mds_stbuf.ia_gid = stbuf->ia_gid; + local->mds_stbuf.ia_uid = stbuf->ia_uid; + local->mds_stbuf.ia_prot = stbuf->ia_prot; + + /* save mds subvol on inode ctx */ + ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_SET_INODE_CTX_FAILED, + "Failed to set MDS subvol for %s vol is %s", + local->loc.path, prev->name); + } + if ((check_mds < 0) && !errst) { + /* Check if xattrs need to be healed on the directory + */ + local->mds_xattr = dict_ref(xattr); + gf_msg_debug(this->name, 0, + "Value of %s is not zero on " + "hashed subvol so xattr needs to" + " be healed on non hashed" + " path is %s and vol name is %s " + " gfid is %s", + conf->mds_xattr_key, local->loc.path, + prev->name, gfid); + local->need_xattr_heal = 1; + } + } + ret = dht_layout_dir_mismatch(this, layout, prev, &local->loc, + xattr); + if (ret != 0) { + /* In memory layout does not match on-disk layout. + */ + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_MISMATCH, + "Mismatching layouts for %s, gfid = %s", local->loc.path, + gfid); + + local->layout_mismatch = 1; + + goto unlock; + } + } - inode_ctx_put (inode, this, (uint64_t)(long)layout); + gf_uuid_copy(local->stbuf.ia_gfid, stbuf->ia_gfid); + dht_iatt_merge(this, &local->stbuf, stbuf); + dht_iatt_merge(this, &local->postparent, postparent); -out: - DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf, xattr); + local->op_ret = 0; - return 0; + if (!local->xattr) { + local->xattr = dict_ref(xattr); + } else if (is_dir) { + dht_aggregate_xattr(local->xattr, xattr); + } + } +unlock: + UNLOCK(&frame->lock); + + if (follow_link) { + /* Found a linkto file. Follow it to see if the target file exists + */ + gf_uuid_copy(local->gfid, stbuf->ia_gfid); + + subvol = dht_linkfile_subvol(this, inode, stbuf, xattr); + if (!subvol) { + op_errno = ESTALE; + local->op_ret = -1; + } else { + STACK_WIND_COOKIE(frame, dht_lookup_linkfile_cbk, subvol, subvol, + subvol->fops->lookup, &local->loc, + local->xattr_req); + return 0; + } + } + + this_call_cnt = dht_frame_return(frame); + + if (is_last_call(this_call_cnt)) { + if (!IA_ISDIR(local->stbuf.ia_type) && + (local->hashed_subvol != local->cached_subvol) && + (local->stbuf.ia_nlink == 1) && + (conf && conf->unhashed_sticky_bit)) { + local->stbuf.ia_prot.sticky = 1; + } + /* No need to call heal code if volume count is 1 + */ + if (conf->subvolume_cnt == 1) + local->need_xattr_heal = 0; + + if (IA_ISDIR(local->stbuf.ia_type)) { + /* No mds xattr found. Trigger a heal to set it */ + if (!__is_root_gfid(local->loc.inode->gfid) && + (!dict_get(local->xattr, conf->mds_xattr_key))) + local->need_selfheal = 1; + + if (dht_needs_selfheal(frame, this)) { + if (!__is_root_gfid(local->loc.inode->gfid)) { + if (local->mds_subvol) { + local->stbuf.ia_gid = local->mds_stbuf.ia_gid; + local->stbuf.ia_uid = local->mds_stbuf.ia_uid; + local->stbuf.ia_prot = local->mds_stbuf.ia_prot; + } + } else { + local->stbuf.ia_gid = local->prebuf.ia_gid; + local->stbuf.ia_uid = local->prebuf.ia_uid; + local->stbuf.ia_prot = local->prebuf.ia_prot; + } + + layout = local->layout; + dht_selfheal_directory(frame, dht_lookup_selfheal_cbk, + &local->loc, layout); + return 0; + } + } + + if (local->layout_mismatch) { + /* Found layout mismatch in the directory, need to + fix this in the inode context */ + dht_layout_unref(this, local->layout); + local->layout = NULL; + dht_lookup_directory(frame, this, &local->loc); + return 0; + } + + if (local->need_lookup_everywhere) { + /* As the current layout gave ENOENT error, we would + need a new layout */ + dht_layout_unref(this, local->layout); + local->layout = NULL; + + /* We know that current cached subvol is no longer + valid, get the new one */ + local->cached_subvol = NULL; + if (local->xattr_req) { + if (!gf_uuid_is_null(local->gfid)) { + ret = dict_set_static_bin(local->xattr_req, "gfid-req", + local->gfid, 16); + } + } + + dht_lookup_everywhere(frame, this, &local->loc); + return 0; + } + if (local->return_estale) { + local->op_ret = -1; + local->op_errno = ESTALE; + } + + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, + &local->postparent, 1); + } + + DHT_STRIP_PHASE1_FLAGS(&local->stbuf); + dht_set_fixed_dir_stat(&local->postparent); + + /* local->stbuf is updated only from subvols which have a layout + * The reason is to avoid choosing attr heal source from newly + * added bricks. In case e.g we have only one subvol and for + * some reason layout is not present on it, then local->stbuf + * will be EINVAL. This is an indication that the subvols + * active in the cluster do not have layouts on disk. + * Unwind with ESTALE to trigger a fresh lookup */ + if (is_dir && local->stbuf.ia_type == IA_INVAL) { + local->op_ret = -1; + local->op_errno = ESTALE; + } + /* Delete mds xattr at the time of STACK UNWIND */ + if (local->xattr) + GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr); + + DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); + } err: - dht_lookup_everywhere (frame, this, loc); + return ret; +} - return 0; +static int +dht_lookup_linkfile_create_cbk(call_frame_t *frame, void *cooie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *cached_subvol = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", this->private, out); + + local = frame->local; + cached_subvol = local->cached_subvol; + conf = this->private; + + gf_uuid_unparse(local->loc.gfid, gfid); + + if (local->locked) + dht_unlock_namespace(frame, &local->lock[0]); + + ret = dht_layout_preset(this, local->cached_subvol, local->loc.inode); + if (ret < 0) { + gf_msg_debug(this->name, EINVAL, + "Failed to set layout for subvolume %s, " + "(gfid = %s)", + cached_subvol ? cached_subvol->name : "<nil>", gfid); + local->op_ret = -1; + local->op_errno = EINVAL; + goto unwind; + } + + local->op_ret = 0; + if ((local->stbuf.ia_nlink == 1) && (conf && conf->unhashed_sticky_bit)) { + local->stbuf.ia_prot.sticky = 1; + } + + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1); + } + +unwind: + gf_msg_debug(this->name, 0, + "creation of linkto on hashed subvol:%s, " + "returned with op_ret %d and op_errno %d: %s", + local->hashed_subvol->name, op_ret, op_errno, + uuid_utoa(local->loc.gfid)); + + if (local->linked == _gf_true) + dht_linkfile_attr_heal(frame, this); + + dht_set_fixed_dir_stat(&local->postparent); + + DHT_STRIP_PHASE1_FLAGS(&local->stbuf); + DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); +out: + return ret; } +static int +dht_lookup_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int this_call_cnt = 0; + dht_local_t *local = NULL; + const char *path = NULL; -int -dht_lookup_directory (call_frame_t *frame, xlator_t *this, loc_t *loc) + local = (dht_local_t *)frame->local; + path = local->loc.path; + FRAME_SU_UNDO(frame, dht_local_t); + + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_UNLINK_LOOKUP_INFO, + "lookup_unlink returned with " + "op_ret -> %d and op-errno -> %d for %s", + op_ret, op_errno, ((path == NULL) ? "null" : path)); + + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + dht_lookup_everywhere_done(frame, this); + } + + return 0; +} + +static int +dht_lookup_unlink_of_false_linkto_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - int call_cnt = 0; - int i = 0; - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; + int this_call_cnt = 0; + dht_local_t *local = NULL; + const char *path = NULL; - conf = this->private; - local = frame->local; + local = (dht_local_t *)frame->local; + path = local->loc.path; - call_cnt = conf->subvolume_cnt; - local->call_cnt = call_cnt; - - local->layout = dht_layout_new (this, conf->subvolume_cnt); - if (!local->layout) { - gf_log (this->name, GF_LOG_ERROR, - "memory allocation failed :("); - DHT_STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; - } - - for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_lookup_dir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->lookup, - &local->loc, local->xattr_req); + FRAME_SU_UNDO(frame, dht_local_t); + + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_UNLINK_LOOKUP_INFO, + "lookup_unlink returned with " + "op_ret -> %d and op-errno -> %d for %s", + op_ret, op_errno, ((path == NULL) ? "null" : path)); + + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + if ((op_ret == 0) || ((op_errno != EBUSY) && (op_errno != ENOTCONN))) { + dht_lookup_everywhere_done(frame, this); + } else { + /*When dht_lookup_everywhere is performed, one cached + *and one hashed file was found and hashed file does + *not point to the above mentioned cached node. So it + *was considered as stale and an unlink was performed. + *But unlink fails. So may be rebalance is in progress. + *now ideally we have two data-files. One obtained during + *lookup_everywhere and one where unlink-failed. So + *at this point in time we cannot decide which one to + *choose because there are chances of first cached + *file is truncated after rebalance and if it is chosen + *as cached node, application will fail. So return EIO.*/ + + if (op_errno == EBUSY) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_UNLINK_FAILED, + "Could not unlink the linkto file as " + "either fd is open and/or linkto xattr " + "is set for %s", + ((path == NULL) ? "null" : path)); + } + DHT_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, NULL, NULL); } - return 0; -} + } + return 0; +} -int -dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct stat *stbuf, dict_t *xattr) +static int +dht_lookup_unlink_stale_linkto_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - dht_layout_t *layout = NULL; - char is_linkfile = 0; - char is_dir = 0; - xlator_t *subvol = NULL; - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - loc_t *loc = NULL; - call_frame_t *prev = NULL; + dht_local_t *local = NULL; + const char *path = NULL; + /* NOTE: + * If stale file unlink fails either there is an open-fd or is not an + * dht-linkto-file then posix_unlink returns EBUSY, which is overwritten + * to ENOENT + */ - conf = this->private; + local = frame->local; - prev = cookie; - local = frame->local; - loc = &local->loc; - - if (ENTRY_MISSING (op_ret, op_errno)) { - if (conf->search_unhashed) { - local->op_errno = ENOENT; - dht_lookup_everywhere (frame, this, loc); - return 0; - } - } - - if (op_ret == 0) { - is_dir = check_is_dir (inode, stbuf, xattr); - if (is_dir) { - local->inode = inode_ref (inode); - local->xattr = dict_ref (xattr); - } - } - - if (is_dir || (op_ret == -1 && op_errno == ENOTCONN)) { - dht_lookup_directory (frame, this, &local->loc); - return 0; - } - - if (op_ret == -1) - goto out; + if (local) { + FRAME_SU_UNDO(frame, dht_local_t); + if (local->loc.path) + path = local->loc.path; + } - is_linkfile = check_is_linkfile (inode, stbuf, xattr); - is_dir = check_is_dir (inode, stbuf, xattr); + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_UNLINK_LOOKUP_INFO, + "Returned with op_ret %d and " + "op_errno %d for %s", + op_ret, op_errno, ((path == NULL) ? "null" : path)); - if (!is_dir && !is_linkfile) { - /* non-directory and not a linkfile */ + DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL, NULL); - dht_itransform (this, prev->this, stbuf->st_ino, - &stbuf->st_ino); + return 0; +} - layout = dht_layout_for_subvol (this, prev->this); - if (!layout) { - gf_log (this->name, GF_LOG_ERROR, - "no pre-set layout for subvolume %s", - prev->this->name); - op_ret = -1; - op_errno = EINVAL; - goto out; - } +static int +dht_fill_dict_to_avoid_unlink_of_migrating_file(dict_t *dict) +{ + int ret = 0; - inode_ctx_put (inode, this, (uint64_t)(long)layout); - goto out; - } + ret = dict_set_int32_sizen(dict, DHT_SKIP_NON_LINKTO_UNLINK, 1); - if (is_linkfile) { - subvol = dht_linkfile_subvol (this, inode, stbuf, xattr); - - if (!subvol) { - gf_log (this->name, GF_LOG_WARNING, - "linkfile not having link subvolume. path=%s", - loc->path); - dht_lookup_everywhere (frame, this, loc); - return 0; - } + if (ret) + return -1; - STACK_WIND (frame, dht_lookup_linkfile_cbk, - subvol, subvol->fops->lookup, - &local->loc, local->xattr_req); + ret = dict_set_int32_sizen(dict, DHT_SKIP_OPEN_FD_UNLINK, 1); + + if (ret) + return -1; + + return 0; +} + +static int32_t +dht_linkfile_create_lookup_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + dht_local_t *local = NULL; + int call_cnt = 0, ret = 0; + xlator_t *subvol = NULL; + uuid_t gfid = { + 0, + }; + char gfid_str[GF_UUID_BUF_SIZE] = {0}; + + subvol = cookie; + local = frame->local; + + if (subvol == local->hashed_subvol) { + if ((op_ret == 0) || (op_errno != ENOENT)) + local->dont_create_linkto = _gf_true; + } else { + if (gf_uuid_is_null(local->gfid)) + gf_uuid_copy(gfid, local->loc.gfid); + else + gf_uuid_copy(gfid, local->gfid); + + if ((op_ret == 0) && gf_uuid_compare(gfid, buf->ia_gfid)) { + gf_uuid_unparse(gfid, gfid_str); + gf_msg_debug(this->name, 0, + "gfid (%s) different on cached subvol " + "(%s) and looked up inode (%s), not " + "creating linkto", + uuid_utoa(buf->ia_gfid), subvol->name, gfid_str); + local->dont_create_linkto = _gf_true; + } else if (op_ret == -1) { + local->dont_create_linkto = _gf_true; + } + } + + call_cnt = dht_frame_return(frame); + if (is_last_call(call_cnt)) { + if (local->dont_create_linkto) + goto no_linkto; + else { + gf_msg_debug(this->name, 0, + "Creating linkto file on %s(hash) to " + "%s on %s (gfid = %s)", + local->hashed_subvol->name, local->loc.path, + local->cached_subvol->name, gfid_str); + + ret = dht_linkfile_create(frame, dht_lookup_linkfile_create_cbk, + this, local->cached_subvol, + local->hashed_subvol, &local->loc); + + if (ret < 0) + goto no_linkto; } + } + + return 0; + +no_linkto: + gf_msg_debug(this->name, 0, + "skipped linkto creation (path:%s) (gfid:%s) " + "(hashed-subvol:%s) (cached-subvol:%s)", + local->loc.path, gfid_str, local->hashed_subvol->name, + local->cached_subvol->name); + + dht_lookup_linkfile_create_cbk(frame, NULL, this, 0, 0, local->loc.inode, + &local->stbuf, &local->preparent, + &local->postparent, local->xattr); + return 0; +} + +static int32_t +dht_call_lookup_linkfile_create(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + int i = 0; + xlator_t *subvol = NULL; + + local = frame->local; + if (gf_uuid_is_null(local->gfid)) + gf_uuid_unparse(local->loc.gfid, gfid); + else + gf_uuid_unparse(local->gfid, gfid); + + if (op_ret < 0) { + gf_log(this->name, GF_LOG_WARNING, + "protecting namespace failed, skipping linkto " + "creation (path:%s)(gfid:%s)(hashed-subvol:%s)" + "(cached-subvol:%s)", + local->loc.path, gfid, local->hashed_subvol->name, + local->cached_subvol->name); + goto err; + } + + local->locked = _gf_true; + + local->call_cnt = 2; + for (i = 0; i < 2; i++) { + subvol = (subvol == NULL) ? local->hashed_subvol : local->cached_subvol; + + STACK_WIND_COOKIE(frame, dht_linkfile_create_lookup_cbk, subvol, subvol, + subvol->fops->lookup, &local->loc, NULL); + } + + return 0; + +err: + dht_lookup_linkfile_create_cbk(frame, NULL, this, 0, 0, local->loc.inode, + &local->stbuf, &local->preparent, + &local->postparent, local->xattr); + return 0; +} + +/* Rebalance is performed from cached_node to hashed_node. Initial cached_node + * contains a non-linkto file. After migration it is converted to linkto and + * then unlinked. And at hashed_subvolume, first a linkto file is present, + * then after migration it is converted to a non-linkto file. + * + * Lets assume a file is present on cached subvolume and a new brick is added + * and new brick is the new_hashed subvolume. So fresh lookup on newly added + * hashed subvolume will fail and dht_lookup_everywhere gets called. If just + * before sending the dht_lookup_everywhere request rebalance is in progress, + * + * from cached subvolume it may see: Nonlinkto or linkto or No file + * from hashed subvolume it may see: No file or linkto file or non-linkto file + * + * So this boils down to 9 cases: + * at cached_subvol at hashed_subvol + * ---------------- ----------------- + * + *a) No file No file + * [request reached after [Request reached before + * migration] Migration] + * + *b) No file Linkto File + * + *c) No file Non-Linkto File + * + *d) Linkto No-File + * + *e) Linkto Linkto + * + *f) Linkto Non-Linkto + * + *g) NonLinkto No-File + * + *h) NonLinkto Linkto + * + *i) NonLinkto NonLinkto + * + * dht_lookup_everywhere_done takes decision based on any of the above case + */ + +static int +dht_lookup_everywhere_done(call_frame_t *frame, xlator_t *this) +{ + int ret = 0; + dht_local_t *local = NULL; + xlator_t *hashed_subvol = NULL; + xlator_t *cached_subvol = NULL; + dht_layout_t *layout = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + gf_boolean_t found_non_linkto_on_hashed = _gf_false; + + local = frame->local; + hashed_subvol = local->hashed_subvol; + cached_subvol = local->cached_subvol; + + gf_uuid_unparse(local->loc.gfid, gfid); + + if (local->file_count && local->dir_count) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_FILE_TYPE_MISMATCH, + "path %s (gfid = %s)exists as a file on one " + "subvolume and directory on another. " + "Please fix it manually", + local->loc.path, gfid); + DHT_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, NULL, NULL); + return 0; + } + if (local->op_ret && local->gfid_missing) { + if (gf_uuid_is_null(local->gfid_req)) { + DHT_STACK_UNWIND(lookup, frame, -1, ENODATA, NULL, NULL, NULL, + NULL); + return 0; + } + /* A hack */ + dht_lookup_directory(frame, this, &local->loc); return 0; + } -out: - DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf, xattr); + if (local->dir_count) { + dht_lookup_directory(frame, this, &local->loc); return 0; -} + } + + gf_msg_debug(this->name, 0, + "STATUS: hashed_subvol %s " + "cached_subvol %s", + (hashed_subvol == NULL) ? "null" : hashed_subvol->name, + (cached_subvol == NULL) ? "null" : cached_subvol->name); + + if (!cached_subvol) { + if (local->skip_unlink.handle_valid_link && hashed_subvol) { + /*Purpose of "DHT_SKIP_NON_LINKTO_UNLINK": + * If this lookup is performed by rebalance and this + * rebalance process detected hashed file and by + * the time it sends the lookup request to cached node, + * file got migrated and now at initial hashed_node, + * final migrated file is present. With current logic, + * because this process fails to find the cached_node, + * it will unlink the file at initial hashed_node. + * + * So we avoid this by setting key, and checking at the + * posix_unlink that unlink the file only if file is a + * linkto file and not a migrated_file. + */ + + ret = dht_fill_dict_to_avoid_unlink_of_migrating_file( + local->xattr_req); + + if (ret) { + /* If for some reason, setting key in the dict + * fails, return with ENOENT, as with respect to + * this process, it detected only a stale link + * file. + * + * Next lookup will delete it. + * + * Performing deletion of stale link file when + * setting key in dict fails, may cause the data + * loss because of the above mentioned race. + */ + + DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL, + NULL); + } else { + local->skip_unlink.handle_valid_link = _gf_false; + + gf_msg_debug(this->name, 0, + "No Cached was found and " + "unlink on hashed was skipped" + " so performing now: %s", + local->loc.path); + FRAME_SU_DO(frame, dht_local_t); + STACK_WIND(frame, dht_lookup_unlink_stale_linkto_cbk, + hashed_subvol, hashed_subvol->fops->unlink, + &local->loc, 0, local->xattr_req); + } + } else { + gf_msg_debug(this->name, 0, + "There was no cached file and " + "unlink on hashed is not skipped %s", + local->loc.path); -int -dht_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) -{ - xlator_t *subvol = NULL; - xlator_t *hashed_subvol = NULL; - xlator_t *cached_subvol = NULL; - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int ret = -1; - int op_errno = -1; - dht_layout_t *layout = NULL; - int i = 0; - int call_cnt = 0; - - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - conf = this->private; - - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "memory allocation failed :("); - goto err; - } - - ret = loc_dup (loc, &local->loc); - if (ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "copying location failed for path=%s", - loc->path); - goto err; + DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL, NULL); + } + return 0; + } + + /* At the time of dht_lookup, no file was found on hashed and that is + * why dht_lookup_everywhere is called, but by the time + * dht_lookup_everywhere + * reached to server, file might have already migrated. In that case we + * will find a migrated file at the hashed_node. In this case store the + * layout in context and return successfully. + */ + + if (hashed_subvol || local->need_lookup_everywhere) { + if (local->need_lookup_everywhere) { + found_non_linkto_on_hashed = _gf_true; + + } else if ((local->file_count == 1) && + (hashed_subvol == cached_subvol)) { + gf_msg_debug(this->name, 0, + "found cached file on hashed subvolume " + "so store in context and return for %s", + local->loc.path); + + found_non_linkto_on_hashed = _gf_true; } - - if (xattr_req) { - local->xattr_req = dict_ref (xattr_req); - } else { - local->xattr_req = dict_new (); - } - hashed_subvol = dht_subvol_get_hashed (this, loc); - cached_subvol = dht_subvol_get_cached (this, loc->inode); + if (found_non_linkto_on_hashed) + goto preset_layout; + } - local->cached_subvol = cached_subvol; - local->hashed_subvol = hashed_subvol; + if (hashed_subvol) { + if (local->skip_unlink.handle_valid_link == _gf_true) { + if (cached_subvol == local->skip_unlink.hash_links_to) { + if (gf_uuid_compare(local->skip_unlink.cached_gfid, + local->skip_unlink.hashed_gfid)) { + /*GFID different, return error*/ + DHT_STACK_UNWIND(lookup, frame, -1, ESTALE, NULL, NULL, + NULL, NULL); - if (is_revalidate (loc)) { - layout = dht_layout_get (this, loc->inode); + return 0; + } - if (!layout) { - gf_log (this->name, GF_LOG_ERROR, - "revalidate without cache. path=%s", - loc->path); - op_errno = EINVAL; - goto err; + ret = dht_layout_preset(this, cached_subvol, local->loc.inode); + if (ret) { + gf_msg(this->name, GF_LOG_INFO, 0, + DHT_MSG_LAYOUT_PRESET_FAILED, + "Could not set pre-set layout " + "for subvolume %s", + cached_subvol->name); } - if (layout->gen && (layout->gen < conf->gen)) { - gf_log (this->name, GF_LOG_DEBUG, - "incomplete layout failure for path=%s", - loc->path); - op_errno = ESTALE; - goto err; - } - - local->inode = inode_ref (loc->inode); - local->st_ino = loc->inode->ino; - - local->call_cnt = layout->cnt; - call_cnt = local->call_cnt; - - /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute, - * revalidates directly go to the cached-subvolume. - */ - ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); - - for (i = 0; i < layout->cnt; i++) { - subvol = layout->list[i].xlator; - - STACK_WIND (frame, dht_revalidate_cbk, - subvol, subvol->fops->lookup, - loc, local->xattr_req); - - if (!--call_cnt) - break; - } - } else { - /* TODO: remove the hard-coding */ - ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); - - ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht.linkto", 256); - - if (!hashed_subvol) { - gf_log (this->name, GF_LOG_ERROR, - "no subvolume in layout for path=%s, " - "checking on all the subvols to see if " - "it is a directory", loc->path); - call_cnt = conf->subvolume_cnt; - local->call_cnt = call_cnt; - - local->layout = dht_layout_new (this, conf->subvolume_cnt); - if (!local->layout) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "memory allocation failed :("); - goto err; - } - - for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_lookup_dir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->lookup, - &local->loc, local->xattr_req); - } - return 0; + local->op_ret = (ret == 0) ? ret : -1; + local->op_errno = (ret == 0) ? ret : EINVAL; + + /* Presence of local->cached_subvol validates + * that lookup from cached node is successful + */ + + if (!local->op_ret && local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, + &local->postparent, 1); } - STACK_WIND (frame, dht_lookup_cbk, - hashed_subvol, hashed_subvol->fops->lookup, - loc, local->xattr_req); + gf_msg_debug(this->name, 0, + "Skipped unlinking linkto file " + "on the hashed subvolume. " + "Returning success as it is a " + "valid linkto file. Path:%s", + local->loc.path); + + goto unwind_hashed_and_cached; + } else { + local->skip_unlink.handle_valid_link = _gf_false; + + gf_msg_debug(this->name, 0, + "Linkto file found on hashed " + "subvol " + "and data file found on cached " + "subvolume. But linkto points to " + "different cached subvolume (%s) " + "path %s", + (local->skip_unlink.hash_links_to + ? local->skip_unlink.hash_links_to->name + : " <nil>"), + local->loc.path); + + if (local->skip_unlink.opend_fd_count == 0) { + ret = dht_fill_dict_to_avoid_unlink_of_migrating_file( + local->xattr_req); + + if (ret) { + DHT_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, + NULL, NULL); + } else { + local->call_cnt = 1; + FRAME_SU_DO(frame, dht_local_t); + STACK_WIND(frame, dht_lookup_unlink_of_false_linkto_cbk, + hashed_subvol, hashed_subvol->fops->unlink, + &local->loc, 0, local->xattr_req); + } + + return 0; + } + } } + } + +preset_layout: + if (found_non_linkto_on_hashed) { + if (local->need_lookup_everywhere) { + if (gf_uuid_compare(local->gfid, local->inode->gfid)) { + /* GFID different, return error */ + DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL, + NULL); + return 0; + } + } + + local->op_ret = 0; + local->op_errno = 0; + layout = dht_layout_for_subvol(this, cached_subvol); + if (!layout) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO, + "%s: no pre-set layout for subvolume %s," + " gfid = %s", + local->loc.path, + (cached_subvol ? cached_subvol->name : "<nil>"), gfid); + } + + ret = dht_layout_set(this, local->inode, layout); + if (ret < 0) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO, + "%s: failed to set layout for subvol %s, " + "gfid = %s", + local->loc.path, + (cached_subvol ? cached_subvol->name : "<nil>"), gfid); + } + + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, + &local->postparent, 1); + } + + DHT_STRIP_PHASE1_FLAGS(&local->stbuf); + dht_set_fixed_dir_stat(&local->postparent); + DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); return 0; + } -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL); + if (!hashed_subvol) { + gf_msg_debug(this->name, 0, + "Cannot create linkfile for %s on %s: " + "hashed subvolume cannot be found, gfid = %s.", + local->loc.path, cached_subvol->name, gfid); + + local->op_ret = 0; + local->op_errno = 0; + + ret = dht_layout_preset(frame->this, cached_subvol, local->inode); + if (ret < 0) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_PRESET_FAILED, + "Failed to set layout for subvol %s" + ", gfid = %s", + cached_subvol ? cached_subvol->name : "<nil>", gfid); + local->op_ret = -1; + local->op_errno = EINVAL; + } + + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, + &local->postparent, 1); + } + + DHT_STRIP_PHASE1_FLAGS(&local->stbuf); + dht_set_fixed_dir_stat(&local->postparent); + DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); return 0; + } + + if (frame->root->op != GF_FOP_RENAME) { + local->current = &local->lock[0]; + ret = dht_protect_namespace(frame, &local->loc, hashed_subvol, + &local->current->ns, + dht_call_lookup_linkfile_create); + } else { + gf_msg_debug(this->name, 0, + "Creating linkto file on %s(hash) to %s on %s " + "(gfid = %s)", + hashed_subvol->name, local->loc.path, cached_subvol->name, + gfid); + + ret = dht_linkfile_create(frame, dht_lookup_linkfile_create_cbk, this, + cached_subvol, hashed_subvol, &local->loc); + } + + return ret; + +unwind_hashed_and_cached: + DHT_STRIP_PHASE1_FLAGS(&local->stbuf); + dht_set_fixed_dir_stat(&local->postparent); + DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); + return 0; } +static int +dht_lookup_everywhere_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xattr, + struct iatt *postparent) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + int is_linkfile = 0; + int is_dir = 0; + loc_t *loc = NULL; + xlator_t *link_subvol = NULL; + int ret = -1; + int32_t fd_count = 0; + dht_conf_t *conf = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + dict_t *dict_req = {0}; + + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", cookie, out); + GF_VALIDATE_OR_GOTO("dht", this->private, out); + + local = frame->local; + loc = &local->loc; + conf = this->private; + + prev = cookie; + + gf_msg_debug(this->name, 0, + "returned with op_ret %d and op_errno %d (%s) " + "from subvol %s", + op_ret, op_errno, loc->path, prev->name); + + LOCK(&frame->lock); + { + if (op_ret == -1) { + if (op_errno != ENOENT) + local->op_errno = op_errno; + if (op_errno == ENODATA) + local->gfid_missing = _gf_true; + goto unlock; + } -int -dht_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct stat *stbuf) -{ - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - - - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_ERROR, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } - - dht_stat_merge (this, &local->stbuf, stbuf, prev->this); - - if (local->inode) - local->stbuf.st_ino = local->inode->ino; - local->op_ret = 0; - } + if (gf_uuid_is_null(local->gfid)) + gf_uuid_copy(local->gfid, buf->ia_gfid); + + gf_uuid_unparse(local->gfid, gfid); + + if (gf_uuid_compare(local->gfid, buf->ia_gfid)) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH, + "%s: gfid differs on subvolume %s," + " gfid local = %s, gfid node = %s", + loc->path, prev->name, gfid, uuid_utoa(buf->ia_gfid)); + } + + is_linkfile = check_is_linkfile(inode, buf, xattr, + conf->link_xattr_name); + + if (is_linkfile) { + link_subvol = dht_linkfile_subvol(this, inode, buf, xattr); + gf_msg_debug(this->name, 0, "found on %s linkfile %s (-> %s)", + prev->name, loc->path, + link_subvol ? link_subvol->name : "''"); + goto unlock; + } + + is_dir = check_is_dir(inode, buf, xattr); + + /* non linkfile GFID takes precedence but don't overwrite + gfid if we have already found a cached file*/ + if (!local->cached_subvol) + gf_uuid_copy(local->gfid, buf->ia_gfid); + + if (is_dir) { + local->dir_count++; + + gf_msg_debug(this->name, 0, "found on %s directory %s", prev->name, + loc->path); + } else { + local->file_count++; + + gf_msg_debug(this->name, 0, "found cached file on %s for %s", + prev->name, loc->path); + + if (!local->cached_subvol) { + /* found one file */ + dht_iatt_merge(this, &local->stbuf, buf); + + local->xattr = dict_ref(xattr); + local-> |
