diff options
Diffstat (limited to 'xlators/cluster/dht')
| -rw-r--r-- | xlators/cluster/dht/src/Makefile.am | 26 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-common.c | 3988 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-common.h | 678 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-diskusage.c | 520 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-hashfn.c | 96 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-helper.c | 780 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-inode-read.c | 1139 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-inode-write.c | 1013 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-layout.c | 249 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-linkfile.c | 253 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-mem-types.h | 25 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-rebalance.c | 1815 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-rename.c | 397 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-selfheal.c | 624 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-shared.c | 758 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht.c | 536 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/nufa.c | 436 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/switch.c | 350 |
18 files changed, 9905 insertions, 3778 deletions
diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am index 8ebcab044..174bea841 100644 --- a/xlators/cluster/dht/src/Makefile.am +++ b/xlators/cluster/dht/src/Makefile.am @@ -2,32 +2,34 @@ xlator_LTLIBRARIES = dht.la nufa.la switch.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster +dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c dht-rebalance.c \ + dht-selfheal.c dht-rename.c dht-hashfn.c dht-diskusage.c \ + dht-common.c dht-inode-write.c dht-inode-read.c dht-shared.c \ + $(top_builddir)/xlators/lib/src/libxlator.c -dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c \ - dht-selfheal.c dht-rename.c dht-hashfn.c dht-diskusage.c \ - $(top_builddir)/xlators/lib/src/libxlator.c - -dht_la_SOURCES = $(dht_common_source) dht.c +dht_la_SOURCES = $(dht_common_source) dht.c nufa_la_SOURCES = $(dht_common_source) nufa.c switch_la_SOURCES = $(dht_common_source) switch.c -dht_la_LDFLAGS = -module -avoidversion +dht_la_LDFLAGS = -module -avoid-version dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -nufa_la_LDFLAGS = -module -avoidversion +nufa_la_LDFLAGS = -module -avoid-version nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -switch_la_LDFLAGS = -module -avoidversion +switch_la_LDFLAGS = -module -avoid-version switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = dht-common.h dht-common.c dht-mem-types.h $(top_builddir)/xlators/lib/src/libxlator.h +noinst_HEADERS = dht-common.h dht-mem-types.h \ + $(top_builddir)/xlators/lib/src/libxlator.h -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) \ +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ -I$(top_srcdir)/xlators/lib/src -CLEANFILES = +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = uninstall-local: rm -f $(DESTDIR)$(xlatordir)/distribute.so diff --git 
a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 6a8455b51..8f61339e6 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2009-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ @@ -31,17 +22,18 @@ #include "dht-common.h" #include "defaults.h" #include "byte-order.h" +#include "glusterfs-acl.h" #include <sys/time.h> #include <libgen.h> - -void +int dht_aggregate (dict_t *this, char *key, data_t *value, void *data) { dict_t *dst = NULL; int64_t *ptr = 0, *size = NULL; int32_t ret = -1; + data_t *dict_data = NULL; dst = data; @@ -53,27 +45,46 @@ dht_aggregate (dict_t *this, char *key, data_t *value, void *data) if (size == NULL) { gf_log ("dht", GF_LOG_WARNING, "memory allocation failed"); - return; + return -1; } ret = dict_set_bin (dst, key, size, sizeof (int64_t)); if (ret < 0) { gf_log ("dht", GF_LOG_WARNING, "dht aggregate dict set failed"); GF_FREE (size); - return; + return -1; } } ptr = data_to_bin (value); if (ptr == NULL) { gf_log ("dht", GF_LOG_WARNING, "data to bin failed"); - return; + return -1; } *size = hton64 (ntoh64 (*size) + ntoh64 (*ptr)); + + } else if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) { + ret = gf_get_min_stime (THIS, dst, key, value); + if (ret < 0) + return ret; + } else { + /* compare user xattrs only */ + if (!strncmp (key, "user.", strlen ("user."))) { + ret = dict_lookup (dst, key, &dict_data); + if (!ret && dict_data && value) { + ret = is_data_equal (dict_data, value); + if (!ret) + gf_log ("dht", GF_LOG_DEBUG, + "xattr mismatch for %s", key); + } + } + ret = dict_set (dst, key, value); + if (ret) + gf_log ("dht", GF_LOG_WARNING, "xattr dict set failed"); } - return; + return 0; } @@ -100,7 +111,7 @@ out: int dht_lookup_selfheal_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) + int op_ret, int op_errno, dict_t *xdata) { dht_local_t *local = NULL; dht_layout_t *layout = NULL; @@ -118,20 +129,14 @@ dht_lookup_selfheal_cbk (call_frame_t *frame, void *cookie, if (ret == 0) { layout = local->selfheal.layout; ret = dht_layout_set (this, local->inode, layout); + } - if (local->ia_ino) { - local->stbuf.ia_ino = local->ia_ino; - } else { - gf_log 
(this->name, GF_LOG_DEBUG, - "could not find hashed subvolume for %s", - local->loc.path); - } - - if (local->loc.parent) - local->postparent.ia_ino = local->loc.parent->ino; + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); } - WIPE (&local->postparent); + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); DHT_STACK_UNWIND (lookup, frame, ret, local->op_errno, local->inode, &local->stbuf, local->xattr, &local->postparent); @@ -142,18 +147,97 @@ out: int -dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, dict_t *xattr, - struct iatt *postparent) +dht_discover_complete (xlator_t *this, call_frame_t *discover_frame) +{ + dht_local_t *local = NULL; + call_frame_t *main_frame = NULL; + int op_errno = 0; + int ret = -1; + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; + + local = discover_frame->local; + layout = local->layout; + conf = this->private; + + LOCK(&discover_frame->lock); + { + main_frame = local->main_frame; + local->main_frame = NULL; + } + UNLOCK(&discover_frame->lock); + + if (!main_frame) + return 0; + + if (local->file_count && local->dir_count) { + gf_log (this->name, GF_LOG_ERROR, + "path %s exists as a file on one subvolume " + "and directory on another. " + "Please fix it manually", + local->loc.path); + op_errno = EIO; + goto out; + } + + if (local->cached_subvol) { + ret = dht_layout_preset (this, local->cached_subvol, + local->inode); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set layout for subvolume %s", + local->cached_subvol ? local->cached_subvol->name : "<nil>"); + op_errno = EINVAL; + goto out; + } + } else { + ret = dht_layout_normalize (this, &local->loc, layout); + if ((ret < 0) || ((ret > 0) && (local->op_ret != 0))) { + /* either the layout is incorrect or the directory is + * not found even in one subvolume. 
+ */ + gf_log (this->name, GF_LOG_DEBUG, + "normalizing failed on %s " + "(overlaps/holes present: %s, " + "ENOENT errors: %d)", local->loc.path, + (ret < 0) ? "yes" : "no", (ret > 0) ? ret : 0); + if ((ret > 0) && (ret == conf->subvolume_cnt)) { + op_errno = ESTALE; + goto out; + } + } + + if (local->inode) + dht_layout_set (this, local->inode, layout); + } + + DHT_STACK_UNWIND (lookup, main_frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); + return 0; +out: + DHT_STACK_UNWIND (lookup, main_frame, -1, op_errno, NULL, NULL, NULL, + NULL); + + return ret; +} + + +int +dht_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) { - dht_conf_t *conf = NULL; dht_local_t *local = NULL; int this_call_cnt = 0; call_frame_t *prev = NULL; dht_layout_t *layout = NULL; int ret = -1; int is_dir = 0; + int is_linkfile = 0; + int attempt_unwind = 0; + dht_conf_t *conf = 0; GF_VALIDATE_OR_GOTO ("dht", frame, out); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -161,15 +245,12 @@ dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, GF_VALIDATE_OR_GOTO ("dht", this->private, out); GF_VALIDATE_OR_GOTO ("dht", cookie, out); - conf = this->private; local = frame->local; prev = cookie; + conf = this->private; layout = local->layout; - if (!op_ret && uuid_is_null (local->gfid)) - memcpy (local->gfid, stbuf->ia_gfid, 16); - /* Check if the gfid is different for file from other node */ if (!op_ret && uuid_compare (local->gfid, stbuf->ia_gfid)) { gf_log (this->name, GF_LOG_WARNING, @@ -177,6 +258,7 @@ dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->loc.path, prev->this->name); } + LOCK (&frame->lock); { /* TODO: assert equal mode on stbuf->st_mode and @@ -186,9 +268,12 @@ dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, */ ret = dht_layout_merge (this, 
layout, prev->this, op_ret, op_errno, xattr); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to merge layouts", local->loc.path); if (op_ret == -1) { - local->op_errno = ENOENT; + local->op_errno = op_errno; gf_log (this->name, GF_LOG_DEBUG, "lookup of %s on %s returned error (%s)", local->loc.path, prev->this->name, @@ -197,17 +282,26 @@ dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto unlock; } + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); is_dir = check_is_dir (inode, stbuf, xattr); - if (!is_dir) { - gf_log (this->name, GF_LOG_DEBUG, - "lookup of %s on %s returned non dir 0%o", - local->loc.path, prev->this->name, - stbuf->ia_type); - local->need_selfheal = 1; - goto unlock; + + if (is_dir) { + local->dir_count ++; + } else { + local->file_count ++; + + if (!is_linkfile) { + /* real file */ + local->cached_subvol = prev->this; + attempt_unwind = 1; + } else { + goto unlock; + } } local->op_ret = 0; + if (local->xattr == NULL) { local->xattr = dict_ref (xattr); } else { @@ -220,73 +314,94 @@ dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); dht_iatt_merge (this, &local->postparent, postparent, prev->this); - - if (!local->ia_ino && - (prev->this == dht_first_up_subvol (this))) { - local->ia_ino = local->stbuf.ia_ino; - } - } unlock: UNLOCK (&frame->lock); +out: + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt) || attempt_unwind) { + dht_discover_complete (this, frame); + } - this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) + DHT_STACK_DESTROY (frame); - if (is_last_call (this_call_cnt)) { - if (local->need_selfheal) { - local->need_selfheal = 0; - dht_lookup_everywhere (frame, this, &local->loc); - return 0; - } + return 0; +} - if (local->op_ret == 0) { - ret = dht_layout_normalize (this, &local->loc, layout); - if (ret != 0) { - gf_log (this->name, 
GF_LOG_DEBUG, - "fixing assignment on %s", - local->loc.path); - goto selfheal; - } +int +dht_discover (call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + int ret; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int call_cnt = 0; + int op_errno = EINVAL; + int i = 0; + call_frame_t *discover_frame = NULL; - dht_layout_set (this, local->inode, layout); + conf = this->private; + local = frame->local; - if (local->ia_ino) { - local->stbuf.ia_ino = local->ia_ino; - } else { - gf_log (this->name, GF_LOG_DEBUG, - "could not find hashed subvol for %s", - local->loc.path); - } + ret = dict_set_uint32 (local->xattr_req, conf->xattr_name, 4 * 4); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set '%s' key", + loc->path, conf->xattr_name); - if (local->loc.parent) - local->postparent.ia_ino = - local->loc.parent->ino; - } + ret = dict_set_uint32 (local->xattr_req, conf->link_xattr_name, 256); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set '%s' key", + loc->path, conf->link_xattr_name); - DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, - local->inode, &local->stbuf, local->xattr, - &local->postparent); + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + local->layout = dht_layout_new (this, conf->subvolume_cnt); + + if (!local->layout) { + op_errno = ENOMEM; + goto err; + } + + uuid_copy (local->gfid, loc->gfid); + + discover_frame = copy_frame (frame); + if (!discover_frame) { + op_errno = ENOMEM; + goto err; + } + + discover_frame->local = local; + frame->local = NULL; + local->main_frame = frame; + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (discover_frame, dht_discover_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, + &local->loc, local->xattr_req); } return 0; -selfheal: - FRAME_SU_DO (frame, dht_local_t); - ret = dht_selfheal_directory (frame, dht_lookup_selfheal_cbk, - &local->loc, layout); -out: - return ret; +err: + DHT_STACK_UNWIND (lookup, frame, -1, 
op_errno, NULL, NULL, NULL, + NULL); + + return 0; } + int -dht_lookup_root_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, dict_t *xattr, - struct iatt *postparent) +dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) { - dht_conf_t *conf = NULL; dht_local_t *local = NULL; int this_call_cnt = 0; call_frame_t *prev = NULL; @@ -300,47 +415,64 @@ dht_lookup_root_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, GF_VALIDATE_OR_GOTO ("dht", this->private, out); GF_VALIDATE_OR_GOTO ("dht", cookie, out); - conf = this->private; local = frame->local; prev = cookie; layout = local->layout; + if (!op_ret && uuid_is_null (local->gfid)) + memcpy (local->gfid, stbuf->ia_gfid, 16); + + /* Check if the gfid is different for file from other node */ + if (!op_ret && uuid_compare (local->gfid, stbuf->ia_gfid)) { + gf_log (this->name, GF_LOG_WARNING, + "%s: gfid different on %s", + local->loc.path, prev->this->name); + } + LOCK (&frame->lock); { + /* TODO: assert equal mode on stbuf->st_mode and + local->stbuf->st_mode + + else mkdir/chmod/chown and fix + */ ret = dht_layout_merge (this, layout, prev->this, op_ret, op_errno, xattr); if (op_ret == -1) { local->op_errno = op_errno; - gf_log (this->name, GF_LOG_ERROR, + gf_log (this->name, GF_LOG_DEBUG, "lookup of %s on %s returned error (%s)", local->loc.path, prev->this->name, strerror (op_errno)); + goto unlock; } is_dir = check_is_dir (inode, stbuf, xattr); if (!is_dir) { - gf_log (this->name, GF_LOG_CRITICAL, + gf_log (this->name, GF_LOG_DEBUG, "lookup of %s on %s returned non dir 0%o", local->loc.path, prev->this->name, stbuf->ia_type); + local->need_selfheal = 1; goto unlock; } local->op_ret = 0; - if (local->xattr == NULL) + if (local->xattr == NULL) { local->xattr = dict_ref (xattr); + } else { + dht_aggregate_xattr 
(local->xattr, xattr); + } + if (local->inode == NULL) local->inode = inode_ref (inode); dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); - - if (prev->this == dht_first_up_subvol (this)) { - local->ia_ino = local->stbuf.ia_ino; - } - + dht_iatt_merge (this, &local->postparent, postparent, + prev->this); } unlock: UNLOCK (&frame->lock); @@ -349,83 +481,45 @@ unlock: this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) { + if (local->need_selfheal) { + local->need_selfheal = 0; + dht_lookup_everywhere (frame, this, &local->loc); + return 0; + } + if (local->op_ret == 0) { ret = dht_layout_normalize (this, &local->loc, layout); + if (ret != 0) { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_DEBUG, "fixing assignment on %s", local->loc.path); + goto selfheal; } - ret = dht_layout_set (this, local->inode, layout); + dht_layout_set (this, local->inode, layout); } + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); + } + + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, local->inode, &local->stbuf, local->xattr, &local->postparent); } -out: - return ret; -} - -static int -dht_do_fresh_lookup_on_root (xlator_t *this, call_frame_t *frame) -{ - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int ret = -1; - int call_cnt = 0; - int i = 0; - int op_errno = EINVAL; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame, unwind); - GF_VALIDATE_OR_GOTO ("dht", frame->local, unwind); - - local = frame->local; - conf = this->private; - if (!conf) - goto err; - - if (local->layout) { - dht_layout_unref (this, local->layout); - local->layout = NULL; - } - - ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "failed to set the dict entry for dht"); - - call_cnt = local->call_cnt = conf->subvolume_cnt; - - local->layout = 
dht_layout_new (this, - conf->subvolume_cnt); - if (!local->layout) { - local->op_errno = ENOMEM; - goto err; - } - - for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_lookup_root_dir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->lookup, - &local->loc, local->xattr_req); - } - return 0; -err: - DHT_STACK_UNWIND (lookup, frame, -1, local->op_errno, - local->inode, &local->stbuf, local->xattr, - &local->postparent); - return 0; -unwind: - DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); +selfheal: + FRAME_SU_DO (frame, dht_local_t); + uuid_copy (local->loc.gfid, local->gfid); + ret = dht_selfheal_directory (frame, dht_lookup_selfheal_cbk, + &local->loc, layout); out: - return -1; + return ret; } int @@ -442,7 +536,8 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int ret = -1; int is_dir = 0; int is_linkfile = 0; - unsigned char root_gfid[16] = {0,}; + call_frame_t *copy = NULL; + dht_local_t *copy_local = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO ("dht", this, err); @@ -469,12 +564,20 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, strerror (op_errno)); } if (op_errno == ESTALE) { - /* propogate the ESTALE to parent. - * setting local->layout_mismatch would send + /* propagate the ESTALE to parent. + * setting local->return_estale would send * ESTALE to parent. 
*/ - local->layout_mismatch = 1; + local->return_estale = 1; } + /* if it is ENOENT, we may have to do a + * 'lookup_everywhere()' to make sure + * the file is not migrated */ + if (op_errno == ENOENT) { + if (IA_ISREG (local->loc.inode->ia_type)) { + local->need_lookup_everywhere = 1; + } + } goto unlock; } @@ -493,18 +596,36 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, layout = local->layout; is_dir = check_is_dir (inode, stbuf, xattr); - is_linkfile = check_is_linkfile (inode, stbuf, xattr); + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); if (is_linkfile) { gf_log (this->name, GF_LOG_INFO, "linkfile found in revalidate for %s", local->loc.path); - local->layout_mismatch = 1; + local->return_estale = 1; goto unlock; } if (is_dir) { + ret = dht_dir_has_layout (xattr, conf->xattr_name); + if (ret >= 0) { + if (is_greater_time(local->stbuf.ia_ctime, + local->stbuf.ia_ctime_nsec, + stbuf->ia_ctime, + stbuf->ia_ctime_nsec)) { + local->prebuf.ia_gid = stbuf->ia_gid; + local->prebuf.ia_uid = stbuf->ia_uid; + } + } + if (local->stbuf.ia_type != IA_INVAL) + { + if ((local->stbuf.ia_gid != stbuf->ia_gid) || + (local->stbuf.ia_uid != stbuf->ia_uid)) { + local->need_selfheal = 1; + } + } ret = dht_layout_dir_mismatch (this, layout, prev->this, &local->loc, xattr); @@ -524,10 +645,6 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, prev->this); local->op_ret = 0; - local->stbuf.ia_ino = local->ia_ino; - - if (local->loc.parent) - local->postparent.ia_ino = local->loc.parent->ino; if (!local->xattr) { local->xattr = dict_ref (xattr); @@ -547,24 +664,59 @@ out: && (conf && conf->unhashed_sticky_bit)) { local->stbuf.ia_prot.sticky = 1; } + if (local->need_selfheal) { + local->need_selfheal = 0; + uuid_copy (local->gfid, local->stbuf.ia_gfid); + local->stbuf.ia_gid = local->prebuf.ia_gid; + local->stbuf.ia_uid = local->prebuf.ia_uid; + copy = create_frame (this, this->ctx->pool); + if (copy) { 
+ copy_local = dht_local_init (copy, &local->loc, + NULL, 0); + if (!copy_local) + goto cont; + copy_local->stbuf = local->stbuf; + copy->local = copy_local; + FRAME_SU_DO (copy, dht_local_t); + ret = synctask_new (this->ctx->env, + dht_dir_attr_heal, + dht_dir_attr_heal_done, + copy, copy); + } + } +cont: + if (local->layout_mismatch) { + /* Found layout mismatch in the directory, need to + fix this in the inode context */ + dht_layout_unref (this, local->layout); + local->layout = NULL; + dht_lookup_directory (frame, this, &local->loc); + return 0; + } - if (local->layout_mismatch) { + if (local->need_lookup_everywhere) { + /* As the current layout gave ENOENT error, we would + need a new layout */ + dht_layout_unref (this, local->layout); + local->layout = NULL; + + /* We know that current cached subvol is no more + valid, get the new one */ + local->cached_subvol = NULL; + dht_lookup_everywhere (frame, this, &local->loc); + return 0; + } + if (local->return_estale) { local->op_ret = -1; local->op_errno = ESTALE; - - /* Because for 'root' inode, there is no FRESH lookup - * sent from FUSE layer upon ESTALE, we need to handle - * that one case here */ - root_gfid[15] = 1; - if (!local->loc.parent && - !uuid_compare (local->loc.inode->gfid, root_gfid)) { - dht_do_fresh_lookup_on_root (this, frame); - return 0; - } } - WIPE (&local->postparent); + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); + } + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, local->inode, &local->stbuf, local->xattr, &local->postparent); @@ -580,7 +732,8 @@ dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; xlator_t *cached_subvol = 
NULL; @@ -613,12 +766,16 @@ dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie, local->stbuf.ia_prot.sticky = 1; } - if (local->loc.parent) - local->postparent.ia_ino = local->loc.parent->ino; + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + postparent, 1); + } unwind: - WIPE (&local->postparent); + if (local->linked == _gf_true) + dht_linkfile_attr_heal (frame, this); + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, local->inode, &local->stbuf, local->xattr, &local->postparent); @@ -628,12 +785,149 @@ out: int +dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this) +{ + int ret = 0; + dht_local_t *local = NULL; + xlator_t *hashed_subvol = NULL; + xlator_t *cached_subvol = NULL; + dht_layout_t *layout = NULL; + + local = frame->local; + hashed_subvol = local->hashed_subvol; + cached_subvol = local->cached_subvol; + + if (local->file_count && local->dir_count) { + gf_log (this->name, GF_LOG_ERROR, + "path %s exists as a file on one subvolume " + "and directory on another. " + "Please fix it manually", + local->loc.path); + DHT_STACK_UNWIND (lookup, frame, -1, EIO, NULL, NULL, NULL, + NULL); + return 0; + } + + if (local->dir_count) { + dht_lookup_directory (frame, this, &local->loc); + return 0; + } + + if (!cached_subvol) { + DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL, NULL, + NULL); + return 0; + } + + if (local->need_lookup_everywhere) { + if (uuid_compare (local->gfid, local->inode->gfid)) { + /* GFID different, return error */ + DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, + NULL, NULL, NULL); + return 0; + } + local->op_ret = 0; + local->op_errno = 0; + layout = dht_layout_for_subvol (this, cached_subvol); + if (!layout) { + gf_log (this->name, GF_LOG_INFO, + "%s: no pre-set layout for subvolume %s", + local->loc.path, (cached_subvol ? 
+ cached_subvol->name : + "<nil>")); + } + + ret = dht_layout_set (this, local->inode, layout); + if (ret < 0) { + gf_log (this->name, GF_LOG_INFO, + "%s: failed to set layout for subvol %s", + local->loc.path, (cached_subvol ? + cached_subvol->name : + "<nil>")); + } + + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); + } + + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); + DHT_STACK_UNWIND (lookup, frame, local->op_ret, + local->op_errno, local->inode, + &local->stbuf, local->xattr, + &local->postparent); + return 0; + } + + if (!hashed_subvol) { + gf_log (this->name, GF_LOG_INFO, + "cannot create linkfile file for %s on %s: " + "hashed subvolume cannot be found.", + local->loc.path, cached_subvol->name); + + local->op_ret = 0; + local->op_errno = 0; + + ret = dht_layout_preset (frame->this, cached_subvol, + local->inode); + if (ret < 0) { + gf_log (this->name, GF_LOG_INFO, + "failed to set layout for subvol %s", + cached_subvol ? cached_subvol->name : + "<nil>"); + local->op_ret = -1; + local->op_errno = EINVAL; + } + + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); + } + + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); + DHT_STACK_UNWIND (lookup, frame, local->op_ret, + local->op_errno, local->inode, + &local->stbuf, local->xattr, + &local->postparent); + return 0; + } + + gf_log (this->name, GF_LOG_DEBUG, + "linking file %s existing on %s to %s (hash)", + local->loc.path, cached_subvol->name, + hashed_subvol->name); + + ret = dht_linkfile_create (frame, + dht_lookup_linkfile_create_cbk, this, + cached_subvol, hashed_subvol, &local->loc); + + return ret; +} + + +int +dht_lookup_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + int this_call_cnt = 0; + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + 
dht_lookup_everywhere_done (frame, this); + } + + return 0; +} + + +int dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xattr, struct iatt *postparent) { - dht_conf_t *conf = NULL; dht_local_t *local = NULL; int this_call_cnt = 0; call_frame_t *prev = NULL; @@ -642,9 +936,9 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this, xlator_t *subvol = NULL; loc_t *loc = NULL; xlator_t *link_subvol = NULL; - xlator_t *hashed_subvol = NULL; - xlator_t *cached_subvol = NULL; - int ret = -1; + int ret = -1; + int32_t fd_count = 0; + dht_conf_t *conf = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, out); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -652,10 +946,9 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this, GF_VALIDATE_OR_GOTO ("dht", cookie, out); GF_VALIDATE_OR_GOTO ("dht", this->private, out); - conf = this->private; - local = frame->local; loc = &local->loc; + conf = this->private; prev = cookie; subvol = prev->this; @@ -667,8 +960,9 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_errno = op_errno; goto unlock; } + if (uuid_is_null (local->gfid)) - memcpy (local->gfid, buf->ia_gfid, 16); + uuid_copy (local->gfid, buf->ia_gfid); if (uuid_compare (local->gfid, buf->ia_gfid)) { gf_log (this->name, GF_LOG_WARNING, @@ -676,7 +970,8 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this, loc->path, prev->this->name); } - is_linkfile = check_is_linkfile (inode, buf, xattr); + is_linkfile = check_is_linkfile (inode, buf, xattr, + conf->link_xattr_name); is_dir = check_is_dir (inode, buf, xattr); if (is_linkfile) { @@ -689,6 +984,9 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto unlock; } + /* non linkfile GFID takes precedence */ + uuid_copy (local->gfid, buf->ia_gfid); + if (is_dir) { local->dir_count++; @@ -714,7 
+1012,7 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this, /* This is where we need 'rename' both entries logic */ gf_log (this->name, GF_LOG_WARNING, "multiple subvolumes (%s and %s) have " - "file %s (preferrably rename the file " + "file %s (preferably rename the file " "in the backend, and do a fresh lookup)", local->cached_subvol->name, subvol->name, local->loc.path); @@ -725,80 +1023,22 @@ unlock: UNLOCK (&frame->lock); if (is_linkfile) { - gf_log (this->name, GF_LOG_INFO, - "deleting stale linkfile %s on %s", - loc->path, subvol->name); - dht_linkfile_unlink (frame, this, subvol, loc); - } - - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - hashed_subvol = local->hashed_subvol; - cached_subvol = local->cached_subvol; - - if (local->file_count && local->dir_count) { - gf_log (this->name, GF_LOG_ERROR, - "path %s exists as a file on one subvolume " - "and directory on another. " - "Please fix it manually", - loc->path); - DHT_STACK_UNWIND (lookup, frame, -1, EIO, NULL, NULL, NULL, - NULL); - return 0; - } - - if (local->dir_count) { - dht_lookup_directory (frame, this, &local->loc); - return 0; - } - - if (!cached_subvol) { - DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL, NULL, - NULL); - return 0; - } - - if (!hashed_subvol) { + ret = dict_get_int32 (xattr, GLUSTERFS_OPEN_FD_COUNT, &fd_count); + /* Delete the linkfile only if there are no open fds on it. + if there is a open-fd, it may be in migration */ + if (!ret && (fd_count == 0)) { gf_log (this->name, GF_LOG_INFO, - "cannot create linkfile file for %s on %s: " - "hashed subvolume cannot be found.", - loc->path, cached_subvol->name); - - local->op_ret = 0; - local->op_errno = 0; - - ret = dht_layout_preset (frame->this, cached_subvol, - local->inode); - if (ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "failed to set layout for subvol %s", - cached_subvol ? 
cached_subvol->name : - "<nil>"); - local->op_ret = -1; - local->op_errno = EINVAL; - } - - if (local->loc.parent) - local->postparent.ia_ino = - local->loc.parent->ino; - - WIPE (&local->postparent); - - DHT_STACK_UNWIND (lookup, frame, local->op_ret, - local->op_errno, local->inode, - &local->stbuf, local->xattr, - &local->postparent); + "deleting stale linkfile %s on %s", + loc->path, subvol->name); + STACK_WIND (frame, dht_lookup_unlink_cbk, + subvol, subvol->fops->unlink, loc, 0, NULL); return 0; } + } - gf_log (this->name, GF_LOG_DEBUG, - "linking file %s existing on %s to %s (hash)", - loc->path, cached_subvol->name, - hashed_subvol->name); - - ret = dht_linkfile_create (frame, - dht_lookup_linkfile_create_cbk, - cached_subvol, hashed_subvol, loc); + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + dht_lookup_everywhere_done (frame, this); } out: @@ -873,7 +1113,16 @@ dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie, gf_log (this->name, GF_LOG_INFO, "lookup of %s on %s (following linkfile) failed (%s)", local->loc.path, subvol->name, strerror (op_errno)); - goto err; + + /* If cached subvol returned ENOTCONN, do not do + lookup_everywhere. We need to make sure linkfile does not get + removed, which can take away the namespace, and subvol is + anyways down. 
*/ + + if (op_errno != ENOTCONN) + goto err; + else + goto unwind; } if (check_is_dir (inode, stbuf, xattr)) { @@ -883,7 +1132,7 @@ dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie, goto err; } - if (check_is_linkfile (inode, stbuf, xattr)) { + if (check_is_linkfile (inode, stbuf, xattr, conf->link_xattr_name)) { gf_log (this->name, GF_LOG_INFO, "lookup of %s on %s (following linkfile) reached link", local->loc.path, subvol->name); @@ -894,15 +1143,13 @@ dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie, gf_log (this->name, GF_LOG_WARNING, "%s: gfid different on data file on %s", local->loc.path, subvol->name); + goto err; } if ((stbuf->ia_nlink == 1) && (conf && conf->unhashed_sticky_bit)) { stbuf->ia_prot.sticky = 1; } - dht_itransform (this, prev->this, stbuf->ia_ino, &stbuf->ia_ino); - if (local->loc.parent) - postparent->ia_ino = local->loc.parent->ino; ret = dht_layout_preset (this, prev->this, inode); if (ret < 0) { @@ -913,9 +1160,13 @@ dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie, op_errno = EINVAL; } -unwind: - WIPE (postparent); + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + postparent, 1); + } +unwind: + DHT_STRIP_PHASE1_FLAGS (stbuf); DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, stbuf, xattr, postparent); @@ -935,6 +1186,7 @@ dht_lookup_directory (call_frame_t *frame, xlator_t *this, loc_t *loc) int i = 0; dht_conf_t *conf = NULL; dht_local_t *local = NULL; + int ret = 0; GF_VALIDATE_OR_GOTO ("dht", frame, out); GF_VALIDATE_OR_GOTO ("dht", this, unwind); @@ -953,6 +1205,19 @@ dht_lookup_directory (call_frame_t *frame, xlator_t *this, loc_t *loc) goto unwind; } + if (local->xattr != NULL) { + dict_unref (local->xattr); + local->xattr = NULL; + } + + if (!uuid_is_null (local->gfid)) { + ret = dict_set_static_bin (local->xattr_req, "gfid-req", + local->gfid, 16); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set gfid", local->loc.path); + } + for (i = 0; 
i < call_cnt; i++) { STACK_WIND (frame, dht_lookup_dir_cbk, conf->subvolumes[i], @@ -982,7 +1247,6 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, loc_t *loc = NULL; call_frame_t *prev = NULL; int ret = 0; - uint64_t tmp_layout = 0; dht_layout_t *parent_layout = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, err); @@ -1013,8 +1277,10 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } if ((conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) && (loc->parent)) { - ret = inode_ctx_get (loc->parent, this, &tmp_layout); - parent_layout = (dht_layout_t *)(long)tmp_layout; + ret = dht_inode_ctx_layout_get (loc->parent, this, + &parent_layout); + if (ret || !parent_layout) + goto out; if (parent_layout->search_unhashed) { local->op_errno = ENOENT; dht_lookup_everywhere (frame, this, loc); @@ -1043,16 +1309,12 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } - is_linkfile = check_is_linkfile (inode, stbuf, xattr); + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); if (!is_linkfile) { /* non-directory and not a linkfile */ - dht_itransform (this, prev->this, stbuf->ia_ino, - &stbuf->ia_ino); - if (loc->parent) - postparent->ia_ino = loc->parent->ino; - ret = dht_layout_preset (this, prev->this, inode); if (ret < 0) { gf_log (this->name, GF_LOG_INFO, @@ -1088,14 +1350,51 @@ out: * from each of the subvolume. See dht_iatt_merge for reference. */ - WIPE (postparent); + if (!op_ret && local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + postparent, 1); + } + DHT_STRIP_PHASE1_FLAGS (stbuf); DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, stbuf, xattr, postparent); err: return 0; } +/* For directories, check if acl xattrs have been requested (by the acl xlator), + * if not, request for them. 
These xattrs are needed for dht dir self-heal to + * perform proper self-healing of dirs + */ +void +dht_check_and_set_acl_xattr_req (inode_t *inode, dict_t *xattr_req) +{ + int ret = 0; + + GF_ASSERT (inode); + GF_ASSERT (xattr_req); + + if (inode->ia_type != IA_IFDIR) + return; + + if (!dict_get (xattr_req, POSIX_ACL_ACCESS_XATTR)) { + ret = dict_set_int8 (xattr_req, POSIX_ACL_ACCESS_XATTR, 0); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set key %s", + POSIX_ACL_ACCESS_XATTR); + } + + if (!dict_get (xattr_req, POSIX_ACL_DEFAULT_XATTR)) { + ret = dict_set_int8 (xattr_req, POSIX_ACL_DEFAULT_XATTR, 0); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set key %s", + POSIX_ACL_DEFAULT_XATTR); + } + + return; +} int dht_lookup (call_frame_t *frame, xlator_t *this, @@ -1103,7 +1402,6 @@ dht_lookup (call_frame_t *frame, xlator_t *this, { xlator_t *subvol = NULL; xlator_t *hashed_subvol = NULL; - xlator_t *cached_subvol = NULL; dht_local_t *local = NULL; dht_conf_t *conf = NULL; int ret = -1; @@ -1111,26 +1409,33 @@ dht_lookup (call_frame_t *frame, xlator_t *this, dht_layout_t *layout = NULL; int i = 0; int call_cnt = 0; - + loc_t new_loc = {0,}; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); conf = this->private; if (!conf) goto err; - local = dht_local_init (frame); + local = dht_local_init (frame, loc, NULL, GF_FOP_LOOKUP); if (!local) { op_errno = ENOMEM; goto err; } - if (!dht_filter_loc_subvol_key (this, loc, &local->loc, - &hashed_subvol)) { - ret = loc_dup (loc, &local->loc); + + ret = dht_filter_loc_subvol_key (this, loc, &new_loc, + &hashed_subvol); + if (ret) { + loc_wipe (&local->loc); + ret = loc_dup (&new_loc, &local->loc); + + /* we no more need 'new_loc' entries */ + loc_wipe (&new_loc); + + /* check if loc_dup() is successful */ if (ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_DEBUG, @@ -1146,16 
+1451,19 @@ dht_lookup (call_frame_t *frame, xlator_t *this, local->xattr_req = dict_new (); } + if (uuid_is_null (loc->pargfid) && !uuid_is_null (loc->gfid) && + !__is_root_gfid (loc->inode->gfid)) { + local->cached_subvol = NULL; + dht_discover (frame, this, loc); + return 0; + } + if (!hashed_subvol) hashed_subvol = dht_subvol_get_hashed (this, loc); - cached_subvol = dht_subvol_get_cached (this, loc->inode); - - local->cached_subvol = cached_subvol; local->hashed_subvol = hashed_subvol; if (is_revalidate (loc)) { - local->layout = layout = dht_layout_get (this, loc->inode); - + layout = local->layout; if (!layout) { gf_log (this->name, GF_LOG_DEBUG, "revalidate without cache. path=%s", @@ -1164,8 +1472,6 @@ dht_lookup (call_frame_t *frame, xlator_t *this, goto err; } - local->ia_ino = loc->inode->ino; - if (layout->gen && (layout->gen < conf->gen)) { gf_log (this->name, GF_LOG_TRACE, "incomplete layout failure for path=%s", @@ -1173,34 +1479,63 @@ dht_lookup (call_frame_t *frame, xlator_t *this, dht_layout_unref (this, local->layout); local->layout = NULL; + local->cached_subvol = NULL; goto do_fresh_lookup; } - local->inode = inode_ref (loc->inode); - - local->call_cnt = 1; - call_cnt = local->call_cnt; + local->inode = inode_ref (loc->inode); /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute, * revalidates directly go to the cached-subvolume. 
*/ ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + conf->xattr_name, 4 * 4); - subvol = local->cached_subvol; + if (IA_ISDIR (local->inode->ia_type)) { + local->call_cnt = call_cnt = conf->subvolume_cnt; + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_revalidate_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, + loc, local->xattr_req); + } + return 0; + } - STACK_WIND (frame, dht_revalidate_cbk, - subvol, subvol->fops->lookup, - &local->loc, local->xattr_req); + call_cnt = local->call_cnt = layout->cnt; + /* need it for self-healing linkfiles which is + 'in-migration' state */ + ret = dict_set_uint32 (local->xattr_req, + GLUSTERFS_OPEN_FD_COUNT, 4); + + /* need it for dir self-heal */ + dht_check_and_set_acl_xattr_req (loc->inode, local->xattr_req); + + for (i = 0; i < call_cnt; i++) { + subvol = layout->list[i].xlator; + + STACK_WIND (frame, dht_revalidate_cbk, + subvol, subvol->fops->lookup, + &local->loc, local->xattr_req); + + } } else { do_fresh_lookup: /* TODO: remove the hard-coding */ ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + conf->xattr_name, 4 * 4); ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht.linkto", 256); + conf->link_xattr_name, 256); + + /* need it for self-healing linkfiles which is + 'in-migration' state */ + ret = dict_set_uint32 (local->xattr_req, + GLUSTERFS_OPEN_FD_COUNT, 4); + + /* need it for dir self-heal */ + dht_check_and_set_acl_xattr_req (loc->inode, local->xattr_req); if (!hashed_subvol) { gf_log (this->name, GF_LOG_DEBUG, @@ -1235,294 +1570,8 @@ dht_lookup (call_frame_t *frame, xlator_t *this, err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); - return 0; -} - - -int -dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *prebuf, - struct iatt *postbuf) -{ - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - - GF_VALIDATE_OR_GOTO ("dht", frame, err); - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame->local, out); - GF_VALIDATE_OR_GOTO ("dht", cookie, out); - - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - local->op_ret = -1; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } - - dht_iatt_merge (this, &local->prebuf, prebuf, prev->this); - dht_iatt_merge (this, &local->stbuf, postbuf, prev->this); - - if (local->inode) { - local->stbuf.ia_ino = local->inode->ino; - local->prebuf.ia_ino = local->inode->ino; - } - - local->op_ret = 0; - } -unlock: - UNLOCK (&frame->lock); -out: - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_UNWIND (truncate, frame, local->op_ret, local->op_errno, - &local->prebuf, &local->stbuf); -err: - return 0; -} - - - -int -dht_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *stbuf) -{ - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - - GF_VALIDATE_OR_GOTO ("dht", frame, err); - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame->local, out); - GF_VALIDATE_OR_GOTO ("dht", cookie, out); - - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } - - dht_iatt_merge (this, 
&local->stbuf, stbuf, prev->this); - - if (local->inode) - local->stbuf.ia_ino = local->inode->ino; - local->op_ret = 0; - } -unlock: - UNLOCK (&frame->lock); -out: - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_UNWIND (stat, frame, local->op_ret, local->op_errno, - &local->stbuf); -err: - return 0; -} - - -int -dht_stat (call_frame_t *frame, xlator_t *this, - loc_t *loc) -{ - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; - int i = 0; - - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - goto err; - } - - local->layout = layout = dht_layout_get (this, loc->inode); - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no layout for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } - - local->inode = inode_ref (loc->inode); - local->call_cnt = layout->cnt; - - for (i = 0; i < layout->cnt; i++) { - subvol = layout->list[i].xlator; - - STACK_WIND (frame, dht_attr_cbk, - subvol, subvol->fops->stat, - loc); - } - - return 0; - -err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL); - - return 0; -} - - -int -dht_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd) -{ - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; - int i = 0; - - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - goto err; - } - - local->layout = layout = dht_layout_get (this, fd->inode); - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no layout for fd=%p", fd); - op_errno = EINVAL; - goto err; - } - - local->inode = inode_ref (fd->inode); - local->call_cnt = layout->cnt;; - - for (i = 0; i < layout->cnt; i++) { - subvol = layout->list[i].xlator; - STACK_WIND (frame, dht_attr_cbk, - subvol, subvol->fops->fstat, - fd); - } - - return 0; - -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (fstat, frame, -1, op_errno, NULL); - - return 0; -} - - -int -dht_truncate (call_frame_t *frame, xlator_t *this, - loc_t *loc, off_t offset) -{ - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - subvol = dht_subvol_get_cached (this, loc->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } - - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - goto err; - } - - local->inode = inode_ref (loc->inode); - local->call_cnt = 1; - - STACK_WIND (frame, dht_truncate_cbk, - subvol, subvol->fops->truncate, - loc, offset); - - return 0; - -err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL); - - return 0; -} - - -int -dht_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset) -{ - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } - - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - goto err; - } - - local->inode = inode_ref (fd->inode); - local->call_cnt = 1; - - STACK_WIND (frame, dht_truncate_cbk, - subvol, subvol->fops->ftruncate, - fd, offset); - - return 0; - -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL); - + DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, + NULL); return 0; } @@ -1530,7 +1579,7 @@ err: int dht_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -1549,21 +1598,23 @@ dht_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto unlock; } - preparent->ia_ino = local->loc.parent->ino; - postparent->ia_ino = local->loc.parent->ino; local->op_ret = 0; local->postparent = *postparent; local->preparent = *preparent; - WIPE (&local->postparent); - WIPE (&local->preparent); + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + &local->preparent, 0); + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); + } } unlock: UNLOCK (&frame->lock); DHT_STACK_UNWIND (unlink, frame, local->op_ret, local->op_errno, - &local->preparent, &local->postparent); + &local->preparent, 
&local->postparent, NULL); return 0; } @@ -1572,7 +1623,7 @@ unlock: int dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -1584,7 +1635,8 @@ dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, LOCK (&frame->lock); { - if (op_ret == -1) { + if ((op_ret == -1) && !((op_errno == ENOENT) || + (op_errno == ENOTCONN))) { local->op_errno = op_errno; gf_log (this->name, GF_LOG_DEBUG, "subvolume %s returned -1 (%s)", @@ -1597,7 +1649,7 @@ dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, unlock: UNLOCK (&frame->lock); - if (op_ret == -1) + if (local->op_ret == -1) goto err; cached_subvol = dht_subvol_get_cached (this, local->loc.inode); @@ -1611,26 +1663,24 @@ unlock: STACK_WIND (frame, dht_unlink_cbk, cached_subvol, cached_subvol->fops->unlink, - &local->loc); + &local->loc, local->flags, NULL); return 0; err: DHT_STACK_UNWIND (unlink, frame, -1, local->op_errno, - NULL, NULL); + NULL, NULL, NULL); return 0; } - int -dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, - int op_errno, struct iatt *prebuf, struct iatt *postbuf) +dht_err_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; call_frame_t *prev = NULL; - local = frame->local; prev = cookie; @@ -1649,278 +1699,275 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, unlock: UNLOCK (&frame->lock); - if (local && (op_ret == 0)) { - prebuf->ia_ino = local->ia_ino; - postbuf->ia_ino = local->ia_ino; - } - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_UNWIND (fsync, frame, local->op_ret, local->op_errno, - prebuf, postbuf); + if (is_last_call (this_call_cnt)) { + 
DHT_STACK_UNWIND (setxattr, frame, local->op_ret, + local->op_errno, NULL); + } return 0; } - - -int -dht_err_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) +static void +fill_layout_info (dht_layout_t *layout, char *buf) { - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - - - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } - - local->op_ret = 0; - } -unlock: - UNLOCK (&frame->lock); + int i = 0; + char tmp_buf[128] = {0,}; - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (setxattr, frame, local->op_ret, local->op_errno); + for (i = 0; i < layout->cnt; i++) { + snprintf (tmp_buf, 128, "(%s %u %u)", + layout->list[i].xlator->name, + layout->list[i].start, + layout->list[i].stop); + if (i) + strcat (buf, " "); + strcat (buf, tmp_buf); } - - return 0; } +void +dht_fill_pathinfo_xattr (xlator_t *this, dht_local_t *local, + char *xattr_buf, int32_t alloc_len, + int flag, char *layout_buf) +{ + if (flag && local->xattr_val) + snprintf (xattr_buf, alloc_len, + "((<"DHT_PATHINFO_HEADER"%s> %s) (%s-layout %s))", + this->name, local->xattr_val, this->name, + layout_buf); + else if (local->xattr_val) + snprintf (xattr_buf, alloc_len, + "(<"DHT_PATHINFO_HEADER"%s> %s)", + this->name, local->xattr_val); + else if (flag) + snprintf (xattr_buf, alloc_len, "(%s-layout %s)", + this->name, layout_buf); +} int -dht_access (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t mask) +dht_vgetxattr_alloc_and_fill (dht_local_t *local, dict_t *xattr, xlator_t *this, + int op_errno) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; + int ret = -1; + char *value = NULL; + int32_t plen = 0; - - VALIDATE_OR_GOTO (frame, err); - 
VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - subvol = dht_subvol_get_cached (this, loc->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; + ret = dict_get_str (xattr, local->xsel, &value); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Subvolume %s returned -1 (%s)", this->name, + strerror (op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + goto out; } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - goto err; - } + local->alloc_len += strlen(value); - local->call_cnt = 1; + if (!local->xattr_val) { + local->alloc_len += (strlen (DHT_PATHINFO_HEADER) + 10); + local->xattr_val = GF_CALLOC (local->alloc_len, sizeof (char), + gf_common_mt_char); + if (!local->xattr_val) { + ret = -1; + goto out; + } + } - STACK_WIND (frame, dht_err_cbk, - subvol, subvol->fops->access, - loc, mask); + if (local->xattr_val) { + plen = strlen (local->xattr_val); + if (plen) { + /* extra byte(s) for \0 to be safe */ + local->alloc_len += (plen + 2); + local->xattr_val = GF_REALLOC (local->xattr_val, + local->alloc_len); + if (!local->xattr_val) { + ret = -1; + goto out; + } + } - return 0; + (void) strcat (local->xattr_val, value); + (void) strcat (local->xattr_val, " "); + local->op_ret = 0; + } -err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (access, frame, -1, op_errno); + ret = 0; - return 0; + out: + return ret; } - int -dht_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, const char *path, struct iatt *sbuf) +dht_vgetxattr_fill_and_set (dht_local_t *local, dict_t **dict, xlator_t *this, + gf_boolean_t flag) { - dht_local_t *local = NULL; + int ret = -1; + char *xattr_buf = NULL; + char layout_buf[8192] = {0,}; - local = frame->local; - if (op_ret == -1) - goto err; + if (flag) + fill_layout_info (local->layout, layout_buf); + + *dict = dict_new (); + if (!*dict) + goto out; + + local->xattr_val[strlen (local->xattr_val) - 1] = '\0'; + + /* we would need max this many bytes to create xattr string + * extra 40 bytes is just an estimated amount of additional + * space required as we include translator name and some + * spaces, brackets etc. when forming the pathinfo string. + * + * For node-uuid we just don't have all the pretty formatting, + * but since this is a generic routine for pathinfo & node-uuid + * we dont have conditional space allocation and try to be + * generic + */ + local->alloc_len += (2 * strlen (this->name)) + + strlen (layout_buf) + + 40; + xattr_buf = GF_CALLOC (local->alloc_len, sizeof (char), + gf_common_mt_char); + if (!xattr_buf) + goto out; - if (local) { - sbuf->ia_ino = local->ia_ino; + if (XATTR_IS_PATHINFO (local->xsel)) { + (void) dht_fill_pathinfo_xattr (this, local, xattr_buf, + local->alloc_len, flag, + layout_buf); + } else if (XATTR_IS_NODE_UUID (local->xsel)) { + (void) snprintf (xattr_buf, local->alloc_len, "%s", + local->xattr_val); } else { - op_ret = -1; - op_errno = EINVAL; + gf_log (this->name, GF_LOG_WARNING, + "Unknown local->xsel (%s)", local->xsel); + goto out; } -err: - DHT_STACK_UNWIND (readlink, frame, op_ret, op_errno, path, sbuf); + ret = dict_set_dynstr (*dict, local->xsel, xattr_buf); + GF_FREE (local->xattr_val); - return 0; + out: + return ret; } - int -dht_readlink 
(call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size) +dht_vgetxattr_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; + int ret = 0; + dht_local_t *local = NULL; + int this_call_cnt = 0; + dict_t *dict = NULL; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (frame->local, out); - subvol = dht_subvol_get_cached (this, loc->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + local = frame->local; - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - goto err; - } + LOCK (&frame->lock); + { + this_call_cnt = --local->call_cnt; + if (op_ret < 0) { + if (op_errno != ENOTCONN) { + gf_log (this->name, GF_LOG_ERROR, + "getxattr err (%s) for dir", + strerror (op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + } - local->ia_ino = loc->inode->ino; + goto unlock; + } - STACK_WIND (frame, dht_readlink_cbk, - subvol, subvol->fops->readlink, - loc, size); + ret = dht_vgetxattr_alloc_and_fill (local, xattr, this, + op_errno); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "alloc or fill failure"); + } + unlock: + UNLOCK (&frame->lock); - return 0; + if (!is_last_call (this_call_cnt)) + goto out; -err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (readlink, frame, -1, op_errno, NULL, NULL); + /* -- last call: do patch ups -- */ - return 0; -} + if (local->op_ret == -1) { + goto unwind; + } + ret = dht_vgetxattr_fill_and_set (local, &dict, this, _gf_true); + if (ret) + goto unwind; -int -dht_fix_layout_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) -{ - DHT_STACK_UNWIND (getxattr, frame, -1, ENODATA, NULL); + DHT_STACK_UNWIND (getxattr, frame, 0, 0, dict, xdata); + goto cleanup; + unwind: + DHT_STACK_UNWIND (getxattr, frame, -1, local->op_errno, NULL, NULL); + cleanup: + if (dict) + dict_unref (dict); + out: return 0; } -static void -fill_layout_info (dht_layout_t *layout, char *buf) -{ - int i = 0; - char tmp_buf[128] = {0,}; - - for (i = 0; i < layout->cnt; i++) { - snprintf (tmp_buf, 128, "(%s %u %u)", - layout->list[i].xlator->name, - layout->list[i].start, - layout->list[i].stop); - if (i) - strcat (buf, " "); - strcat (buf, tmp_buf); - } -} - int -dht_pathinfo_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xattr) +dht_vgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { - dht_local_t *local = NULL; - int ret = 0; - int flag = 0; - int this_call_cnt = 0; - char *value_got = NULL; - char layout_buf[8192] = {0,}; - char xattr_buf[8192 + 1024] = {0,}; - dict_t *dict = NULL; + dht_local_t *local = NULL; + int ret = 0; + dict_t *dict = NULL; + call_frame_t *prev = NULL; + gf_boolean_t flag = _gf_true; local = frame->local; + prev = cookie; - if (op_ret != -1) { - ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &value_got); - if (!ret) { - if (!local->pathinfo) - local->pathinfo = GF_CALLOC (8192, sizeof (char), - gf_common_mt_char); - if (local->pathinfo) - strcat (local->pathinfo, value_got); - } + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; + gf_log (this->name, 
GF_LOG_ERROR, "Subvolume %s returned -1 " + "(%s)", prev->this->name, strerror (op_errno)); + goto unwind; } - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - if (local->layout->cnt > 1) { - /* Set it for directory */ - fill_layout_info (local->layout, layout_buf); - flag = 1; - } - - dict = dict_new (); - - if (flag && local->pathinfo) - snprintf (xattr_buf, 9216, "((%s %s) (%s-layout %s))", - this->name, local->pathinfo, this->name, - layout_buf); - else if (local->pathinfo) - snprintf (xattr_buf, 9216, "(%s %s)", - this->name, local->pathinfo); - else if (flag) - snprintf (xattr_buf, 9216, "(%s-layout %s)", - this->name, layout_buf); - - ret = dict_set_str (dict, GF_XATTR_PATHINFO_KEY, - xattr_buf); - - if (local->pathinfo) - GF_FREE (local->pathinfo); - - DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); - - if (dict) - dict_unref (dict); - - return 0; + ret = dht_vgetxattr_alloc_and_fill (local, xattr, this, + op_errno); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "alloc or fill failure"); + goto unwind; } - if (local->pathinfo) - strcat (local->pathinfo, " Link: "); - if (local->hashed_subvol) { - /* This will happen if there pending */ - STACK_WIND (frame, dht_pathinfo_getxattr_cbk, local->hashed_subvol, - local->hashed_subvol->fops->getxattr, - &local->loc, local->key); + flag = (local->layout->cnt > 1) ? 
_gf_true : _gf_false; - return 0; - } + ret = dht_vgetxattr_fill_and_set (local, &dict, this, flag); + if (ret) + goto unwind; - gf_log ("this->name", GF_LOG_ERROR, "Unable to find hashed_subvol for path" - " %s", local->pathinfo); + DHT_STACK_UNWIND (getxattr, frame, 0, 0, dict, xdata); + goto cleanup; + + unwind: + DHT_STACK_UNWIND (getxattr, frame, -1, local->op_errno, + NULL, NULL); + cleanup: + if (dict) + dict_unref (dict); - DHT_STACK_UNWIND (getxattr, frame, -1, op_errno, dict); return 0; } int dht_linkinfo_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xattr) + int op_ret, int op_errno, dict_t *xattr, + dict_t *xdata) { int ret = 0; char *value = NULL; @@ -1935,21 +1982,24 @@ dht_linkinfo_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } } - DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr); + DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata); return 0; } int dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xattr) + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { int this_call_cnt = 0; dht_local_t *local = NULL; + dht_conf_t *conf = NULL; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (frame->local, out); + VALIDATE_OR_GOTO (this->private, out); + conf = this->private; local = frame->local; this_call_cnt = dht_frame_return (frame); @@ -1957,8 +2007,8 @@ dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (!xattr || (op_ret == -1)) goto out; - if (dict_get (xattr, "trusted.glusterfs.dht")) { - dict_del (xattr, "trusted.glusterfs.dht"); + if (dict_get (xattr, conf->xattr_name)) { + dict_del (xattr, conf->xattr_name); } local->op_ret = 0; @@ -1966,31 +2016,97 @@ dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->xattr = dict_copy_with_ref (xattr, NULL); } else { /* first aggregate everything into xattr and then copy into - * local->xattr. 
+ * local->xattr. This is required as we want to have + * 'local->xattr' as the proper final dictionary passed above + * distribute xlator. */ dht_aggregate_xattr (xattr, local->xattr); local->xattr = dict_copy (xattr, local->xattr); } out: if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (getxattr, frame, local->op_ret, op_errno, local->xattr); + DHT_STACK_UNWIND (getxattr, frame, local->op_ret, op_errno, + local->xattr, NULL); } return 0; } int32_t dht_getxattr_unwind (call_frame_t *frame, - int op_ret, int op_errno, dict_t *dict) + int op_ret, int op_errno, dict_t *dict, dict_t *xdata) { - DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); + DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); return 0; } int +dht_getxattr_get_real_filename_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + dict_t *xattr, dict_t *xdata) +{ + int this_call_cnt = 0; + dht_local_t *local = NULL; + + + local = frame->local; + + if (op_ret != -1) { + if (local->xattr) + dict_unref (local->xattr); + local->xattr = dict_ref (xattr); + + if (local->xattr_req) + dict_unref (local->xattr_req); + local->xattr_req = dict_ref (xdata); + } + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + DHT_STACK_UNWIND (getxattr, frame, local->op_ret, op_errno, + local->xattr, local->xattr_req); + } + + return 0; +} + + +int +dht_getxattr_get_real_filename (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *key, dict_t *xdata) +{ + dht_local_t *local = NULL; + int i = 0; + dht_layout_t *layout = NULL; + int cnt = 0; + xlator_t *subvol = NULL; + + + local = frame->local; + layout = local->layout; + + cnt = local->call_cnt = layout->cnt; + + local->op_ret = -1; + local->op_errno = ENODATA; + + for (i = 0; i < cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND (frame, dht_getxattr_get_real_filename_cbk, + subvol, subvol->fops->getxattr, + loc, key, xdata); + } + + return 0; +} + + +int 
dht_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *key) + loc_t *loc, const char *key, dict_t *xdata) +#define DHT_IS_DIR(layout) (layout->cnt > 1) { + xlator_t *subvol = NULL; xlator_t *hashed_subvol = NULL; xlator_t *cached_subvol = NULL; @@ -1999,8 +2115,6 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, dht_layout_t *layout = NULL; xlator_t **sub_volumes = NULL; int op_errno = -1; - int ret = 0; - int flag = 0; int i = 0; int cnt = 0; @@ -2008,64 +2122,94 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); VALIDATE_OR_GOTO (this->private, err); conf = this->private; - layout = dht_layout_get (this, loc->inode); - if (!layout) { - gf_log (this->name, GF_LOG_ERROR, - "layout is NULL"); - op_errno = ENOENT; - goto err; - } - local = dht_local_init (frame); + local = dht_local_init (frame, loc, NULL, GF_FOP_GETXATTR); if (!local) { op_errno = ENOMEM; goto err; } - if (key && (strcmp (key, GF_XATTR_PATHINFO_KEY) == 0)) { - hashed_subvol = dht_subvol_get_hashed (this, loc); - cached_subvol = dht_subvol_get_cached (this, loc->inode); - - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - - goto err; - } - - ret = loc_dup (loc, &local->loc); - if (ret == -1) { - op_errno = ENOMEM; + layout = local->layout; + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "layout is NULL"); + op_errno = ENOENT; + goto err; + } - goto err; - } + if (key) { local->key = gf_strdup (key); if (!local->key) { op_errno = ENOMEM; - goto err; } - local->layout = layout; + } - local->call_cnt = 1; - if (hashed_subvol != cached_subvol) { - local->call_cnt = 2; - local->hashed_subvol = hashed_subvol; + if (key && + (strncmp (key, GF_XATTR_GET_REAL_FILENAME_KEY, + strlen (GF_XATTR_GET_REAL_FILENAME_KEY)) == 0) + && DHT_IS_DIR(layout)) { + dht_getxattr_get_real_filename (frame, this, loc, key, xdata); + return 0; 
+ } + + /* for file use cached subvolume (obviously!): see if {} + * below + * for directory: + * wind to all subvolumes and exclude subvolumes which + * return ENOTCONN (in callback) + * + * NOTE: Don't trust inode here, as that may not be valid + * (until inode_link() happens) + */ + if (key && DHT_IS_DIR(layout) && + ((strcmp (key, GF_XATTR_PATHINFO_KEY) == 0) + || (strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0))) { + (void) strncpy (local->xsel, key, 256); + cnt = local->call_cnt = layout->cnt; + for (i = 0; i < cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND (frame, dht_vgetxattr_dir_cbk, + subvol, subvol->fops->getxattr, + loc, key, NULL); } + return 0; + } - STACK_WIND (frame, dht_pathinfo_getxattr_cbk, cached_subvol, - cached_subvol->fops->getxattr, loc, key); + /* node-uuid or pathinfo for files */ + if (key && ((strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0) + || (strcmp (key, GF_XATTR_PATHINFO_KEY) == 0))) { + cached_subvol = local->cached_subvol; + (void) strncpy (local->xsel, key, 256); + + local->call_cnt = 1; + STACK_WIND (frame, dht_vgetxattr_cbk, cached_subvol, + cached_subvol->fops->getxattr, loc, key, NULL); return 0; } + if (key && (strcmp (key, GF_XATTR_LINKINFO_KEY) == 0)) { hashed_subvol = dht_subvol_get_hashed (this, loc); + if (!hashed_subvol) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get" + "hashed subvol for %s", loc->path); + op_errno = EINVAL; + goto err; + } + cached_subvol = dht_subvol_get_cached (this, loc->inode); + if (!cached_subvol) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get" + "cached subvol for %s", loc->path); + op_errno = EINVAL; + goto err; + } + if (hashed_subvol == cached_subvol) { op_errno = ENODATA; goto err; @@ -2073,41 +2217,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, if (hashed_subvol) { STACK_WIND (frame, dht_linkinfo_getxattr_cbk, hashed_subvol, hashed_subvol->fops->getxattr, loc, - GF_XATTR_PATHINFO_KEY); - return 0; - } - op_errno = ENODATA; - goto err; - } - if (key && (strcmp 
(key, GF_XATTR_FIX_LAYOUT_KEY) == 0)) { - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].start == layout->list[i].stop) { - flag = 1; - break; - } - } - if ((layout->cnt < conf->subvolume_cnt) || flag) { - gf_log (this->name, GF_LOG_INFO, - "expanding layout of %s from %d to %d", - loc->path, layout->cnt, conf->subvolume_cnt); - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - - goto err; - } - - ret = loc_dup (loc, &local->loc); - if (ret == -1) { - op_errno = ENOMEM; - - goto err; - } - local->layout = layout; - //layout = dht_layout_new (this, conf->subvolume_cnt); - - dht_selfheal_new_directory (frame, dht_fix_layout_cbk, - layout); + GF_XATTR_PATHINFO_KEY, NULL); return 0; } op_errno = ENODATA; @@ -2115,13 +2225,13 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, } if (key && (!strcmp (GF_XATTR_MARKER_KEY, key)) - && (-1 == frame->root->pid)) { - - if (loc->inode-> ia_type == IA_IFDIR) { + && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { + if (DHT_IS_DIR(layout)) { cnt = layout->cnt; } else { cnt = 1; } + sub_volumes = alloca ( cnt * sizeof (xlator_t *)); for (i = 0; i < cnt; i++) *(sub_volumes + i) = layout->list[i].xlator; @@ -2129,7 +2239,8 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, if (cluster_getmarkerattr (frame, this, loc, key, local, dht_getxattr_unwind, sub_volumes, cnt, - MARKER_UUID_TYPE, conf->vol_uuid)) { + MARKER_UUID_TYPE, marker_uuid_default_gauge, + conf->vol_uuid)) { op_errno = EINVAL; goto err; } @@ -2139,8 +2250,8 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, if (key && *conf->vol_uuid) { if ((match_uuid_local (key, conf->vol_uuid) == 0) && - (-1 == frame->root->pid)) { - if (loc->inode-> ia_type == IA_IFDIR) { + (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { + if (DHT_IS_DIR(layout)) { cnt = layout->cnt; } else { cnt = 1; @@ -2153,6 +2264,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, local, dht_getxattr_unwind, sub_volumes, cnt, MARKER_XTIME_TYPE, + 
marker_xtime_default_gauge, conf->vol_uuid)) { op_errno = EINVAL; goto err; @@ -2162,24 +2274,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, } } - ret = loc_dup (loc, &local->loc); - if (ret == -1) { - op_errno = ENOMEM; - - goto err; - } - - if (key) { - local->key = gf_strdup (key); - if (!local->key) { - op_errno = ENOMEM; - - goto err; - } - } - local->layout = layout; - - if (loc->inode-> ia_type == IA_IFDIR) { + if (DHT_IS_DIR(layout)) { cnt = local->call_cnt = layout->cnt; } else { cnt = local->call_cnt = 1; @@ -2189,172 +2284,215 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, subvol = layout->list[i].xlator; STACK_WIND (frame, dht_getxattr_cbk, subvol, subvol->fops->getxattr, - loc, key); + loc, key, NULL); } return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL); return 0; } +#undef DHT_IS_DIR int -dht_fsetxattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, dict_t *xattr, int flags) +dht_fgetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *key, dict_t *xdata) { - xlator_t *subvol = NULL; - dht_local_t *local = NULL; - int op_errno = EINVAL; + xlator_t *subvol = NULL; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int op_errno = -1; + int i = 0; + int cnt = 0; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); VALIDATE_OR_GOTO (fd->inode, err); + VALIDATE_OR_GOTO (this->private, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_FGETXATTR); + if (!local) { + op_errno = ENOMEM; - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; goto err; } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; + layout = local->layout; + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "layout is NULL"); + op_errno = 
ENOENT; goto err; } - local->inode = inode_ref (fd->inode); - local->call_cnt = 1; + if (key) { + local->key = gf_strdup (key); + if (!local->key) { + op_errno = ENOMEM; + goto err; + } + } - STACK_WIND (frame, dht_err_cbk, subvol, subvol->fops->fsetxattr, - fd, xattr, flags); + if ((fd->inode->ia_type == IA_IFDIR) + && (strncmp (key, GF_XATTR_LOCKINFO_KEY, + strlen (GF_XATTR_LOCKINFO_KEY) != 0))) { + cnt = local->call_cnt = layout->cnt; + } else { + cnt = local->call_cnt = 1; + } + for (i = 0; i < cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND (frame, dht_getxattr_cbk, + subvol, subvol->fops->fgetxattr, + fd, key, NULL); + } return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (fsetxattr, frame, -1, op_errno); + DHT_STACK_UNWIND (fgetxattr, frame, -1, op_errno, NULL, NULL); return 0; } - int -dht_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr, int flags) +dht_fsetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, dict_t *xattr, int flags, dict_t *xdata) { xlator_t *subvol = NULL; dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - dht_layout_t *layout = NULL; - int i = 0; int op_errno = EINVAL; + dht_conf_t *conf = NULL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + VALIDATE_OR_GOTO (this->private, err); - conf = this->private; - subvol = dht_subvol_get_cached (this, loc->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + conf = this->private; + + GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr, + op_errno, err); - local = dht_local_init (frame); + local = dht_local_init (frame, NULL, fd, GF_FOP_FSETXATTR); if (!local) { op_errno = ENOMEM; goto err; } - local->layout = layout = dht_layout_get 
(this, loc->inode); - if (!layout) { + subvol = local->cached_subvol; + if (!subvol) { gf_log (this->name, GF_LOG_DEBUG, - "no layout for path=%s", loc->path); + "no cached subvolume for fd=%p", fd); op_errno = EINVAL; goto err; } - local->call_cnt = layout->cnt; + local->call_cnt = 1; - for (i = 0; i < layout->cnt; i++) { - STACK_WIND (frame, dht_err_cbk, - layout->list[i].xlator, - layout->list[i].xlator->fops->setxattr, - loc, xattr, flags); - } + STACK_WIND (frame, dht_err_cbk, subvol, subvol->fops->fsetxattr, + fd, xattr, flags, NULL); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (setxattr, frame, -1, op_errno); + DHT_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); return 0; } +static int +dht_common_setxattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xdata) +{ + DHT_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); + + return 0; +} + int -dht_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) +dht_checking_pathinfo_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, + dict_t *xdata) { + int i = -1; + int ret = -1; + char *value = NULL; dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + call_frame_t *prev = NULL; int this_call_cnt = 0; - call_frame_t *prev = NULL; local = frame->local; prev = cookie; + conf = this->private; - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } + if (op_ret == -1) + goto out; - local->op_ret = 0; + + ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &value); + if (ret) + goto out; + + if (!strcmp (value, local->key)) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == prev->this) + conf->decommissioned_bricks[i] = prev->this; + } } -unlock: - 
UNLOCK (&frame->lock); +out: this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (removexattr, frame, local->op_ret, local->op_errno); + DHT_STACK_UNWIND (setxattr, frame, local->op_ret, ENOTSUP, NULL); } - return 0; -} +} int -dht_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *key) +dht_setxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *xattr, int flags, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; - - int i; + xlator_t *subvol = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + dht_layout_t *layout = NULL; + int i = 0; + int op_errno = EINVAL; + int ret = -1; + data_t *tmp = NULL; + uint32_t dir_spread = 0; + char value[4096] = {0,}; + gf_dht_migrate_data_type_t forced_rebalance = GF_DHT_MIGRATE_DATA; + int call_cnt = 0; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - subvol = dht_subvol_get_cached (this, loc->inode); + conf = this->private; + + GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr, + op_errno, err); + + local = dht_local_init (frame, loc, NULL, GF_FOP_SETXATTR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; if (!subvol) { gf_log (this->name, GF_LOG_DEBUG, "no cached subvolume for path=%s", loc->path); @@ -2362,48 +2500,148 @@ dht_removexattr (call_frame_t *frame, xlator_t *this, goto err; } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; + layout = local->layout; + if (!layout) { + gf_log (this->name, GF_LOG_DEBUG, + "no layout for path=%s", loc->path); + op_errno = EINVAL; goto err; } - local->layout = layout = dht_layout_get (this, loc->inode); - if (!local->layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no layout for path=%s", loc->path); + local->call_cnt = call_cnt = 
layout->cnt; + + tmp = dict_get (xattr, "distribute.migrate-data"); + if (tmp) { + if (IA_ISDIR (loc->inode->ia_type)) { + op_errno = ENOTSUP; + goto err; + } + + /* TODO: need to interpret the 'value' for more meaning + (ie, 'target' subvolume given there, etc) */ + memcpy (value, tmp->data, tmp->len); + if (strcmp (value, "force") == 0) + forced_rebalance = + GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS; + + if (conf->decommission_in_progress) + forced_rebalance = GF_DHT_MIGRATE_HARDLINK; + + local->rebalance.target_node = dht_subvol_get_hashed (this, loc); + if (!local->rebalance.target_node) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get " + "hashed subvol for %s", loc->path); + op_errno = EINVAL; + goto err; + } + + local->rebalance.from_subvol = local->cached_subvol; + + if (local->rebalance.target_node == local->rebalance.from_subvol) { + op_errno = EEXIST; + goto err; + } + if (local->rebalance.target_node) { + local->flags = forced_rebalance; + + ret = dht_start_rebalance_task (this, frame); + if (!ret) + return 0; + + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to create a new synctask", + loc->path); + } op_errno = EINVAL; goto err; + } - local->call_cnt = layout->cnt; + tmp = dict_get (xattr, "decommission-brick"); + if (tmp) { + /* This operation should happen only on '/' */ + if (!__is_root_gfid (loc->inode->gfid)) { + op_errno = ENOTSUP; + goto err; + } - for (i = 0; i < layout->cnt; i++) { - STACK_WIND (frame, dht_removexattr_cbk, + memcpy (value, tmp->data, ((tmp->len < 4095) ? 
tmp->len : 4095)); + local->key = gf_strdup (value); + local->call_cnt = conf->subvolume_cnt; + + for (i = 0 ; i < conf->subvolume_cnt; i++) { + /* Get the pathinfo, and then compare */ + STACK_WIND (frame, dht_checking_pathinfo_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->getxattr, + loc, GF_XATTR_PATHINFO_KEY, NULL); + } + return 0; + } + + tmp = dict_get (xattr, GF_XATTR_FIX_LAYOUT_KEY); + if (tmp) { + gf_log (this->name, GF_LOG_INFO, + "fixing the layout of %s", loc->path); + + ret = dht_fix_directory_layout (frame, dht_common_setxattr_cbk, + layout); + if (ret) { + op_errno = ENOTCONN; + goto err; + } + return ret; + } + + tmp = dict_get (xattr, "distribute.directory-spread-count"); + if (tmp) { + /* Setxattr value is packed as 'binary', not string */ + memcpy (value, tmp->data, ((tmp->len < 4095)?tmp->len:4095)); + ret = gf_string2uint32 (value, &dir_spread); + if (!ret && ((dir_spread <= conf->subvolume_cnt) && + (dir_spread > 0))) { + layout->spread_cnt = dir_spread; + + ret = dht_fix_directory_layout (frame, + dht_common_setxattr_cbk, + layout); + if (ret) { + op_errno = ENOTCONN; + goto err; + } + return ret; + } + gf_log (this->name, GF_LOG_ERROR, + "wrong 'directory-spread-count' value (%s)", value); + op_errno = ENOTSUP; + goto err; + } + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_err_cbk, layout->list[i].xlator, - layout->list[i].xlator->fops->removexattr, - loc, key); + layout->list[i].xlator->fops->setxattr, + loc, xattr, flags, xdata); } return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (removexattr, frame, -1, op_errno); + DHT_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); return 0; } int -dht_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd) +dht_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; call_frame_t *prev = NULL; - local = frame->local; prev = cookie; @@ -2423,331 +2661,175 @@ unlock: UNLOCK (&frame->lock); this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_UNWIND (open, frame, local->op_ret, local->op_errno, - local->fd); + if (is_last_call (this_call_cnt)) { + DHT_STACK_UNWIND (removexattr, frame, local->op_ret, + local->op_errno, NULL); + } return 0; } int -dht_open (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags, fd_t *fd, int wbflags) +dht_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *key, dict_t *xdata) { xlator_t *subvol = NULL; - int ret = -1; int op_errno = -1; dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int call_cnt = 0; + dht_conf_t *conf = NULL; + int i; - VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (this->private, err); - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + conf = this->private; - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; + GF_IF_NATIVE_XATTR_GOTO (conf->wild_xattr_name, key, op_errno, err); - goto err; - } + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); - local->fd = fd_ref (fd); - ret = loc_dup (loc, &local->loc); - if (ret == -1) { + local = dht_local_init (frame, loc, NULL, GF_FOP_REMOVEXATTR); + if (!local) { op_errno = ENOMEM; - goto 
err; } - local->call_cnt = 1; - - STACK_WIND (frame, dht_fd_cbk, - subvol, subvol->fops->open, - loc, flags, fd, wbflags); - - return 0; - -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (open, frame, -1, op_errno, NULL); - - return 0; -} - - -int -dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - struct iovec *vector, int count, struct iatt *stbuf, - struct iobref *iobref) -{ - dht_local_t *local = frame->local; - - if (!local) { - op_ret = -1; - op_errno = EINVAL; - goto out; - } - - if (op_ret != -1) - stbuf->ia_ino = local->ia_ino; -out: - DHT_STACK_UNWIND (readv, frame, op_ret, op_errno, vector, count, stbuf, - iobref); - - return 0; -} - - -int -dht_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off) -{ - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - subvol = dht_subvol_get_cached (this, fd->inode); + subvol = local->cached_subvol; if (!subvol) { gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); + "no cached subvolume for path=%s", loc->path); op_errno = EINVAL; goto err; } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - goto err; - } - - local->ia_ino = fd->inode->ino; - STACK_WIND (frame, dht_readv_cbk, - subvol, subvol->fops->readv, - fd, size, off); - - return 0; - -err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL); - - return 0; -} - - -int -dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *prebuf, - struct iatt *postbuf) -{ - dht_local_t *local = NULL; - - if (op_ret == -1) { - goto out; - } - - local = frame->local; - if (!local) { - op_ret = -1; - op_errno = EINVAL; - goto out; - } - - prebuf->ia_ino = local->ia_ino; - postbuf->ia_ino = local->ia_ino; - -out: - DHT_STACK_UNWIND (writev, frame, op_ret, op_errno, prebuf, postbuf); - - return 0; -} - - -int -dht_writev (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct iovec *vector, int count, off_t off, - struct iobref *iobref) -{ - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { + layout = local->layout; + if (!local->layout) { gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); + "no layout for path=%s", loc->path); op_errno = EINVAL; goto err; } - local = dht_local_init (frame); - if (!local) { + local->call_cnt = call_cnt = layout->cnt; + local->key = gf_strdup (key); - op_errno = ENOMEM; - goto err; + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_removexattr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->removexattr, + loc, key, NULL); } - local->ia_ino = fd->inode->ino; - - STACK_WIND (frame, dht_writev_cbk, - subvol, subvol->fops->writev, - fd, vector, count, off, iobref); - return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL); + DHT_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); return 0; } - int -dht_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +dht_fremovexattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *key, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int call_cnt = 0; + dht_conf_t *conf = 0; + int i; - VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (this->private, err); - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + conf = this->private; - local = dht_local_init (frame); + GF_IF_NATIVE_XATTR_GOTO (conf->wild_xattr_name, key, op_errno, err); + + VALIDATE_OR_GOTO (frame, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_FREMOVEXATTR); if (!local) { op_errno = ENOMEM; - goto err; } - local->fd = fd_ref (fd); - local->call_cnt = 1; - - STACK_WIND (frame, dht_err_cbk, - subvol, subvol->fops->flush, fd); - - return 0; - -err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (flush, frame, -1, op_errno); - - return 0; -} - - -int -dht_fsync (call_frame_t *frame, xlator_t *this, - fd_t *fd, int datasync) -{ - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - subvol = dht_subvol_get_cached (this, fd->inode); + subvol = local->cached_subvol; if (!subvol) { gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); + "no cached subvolume for inode=%s", + uuid_utoa (fd->inode->gfid)); op_errno = EINVAL; goto err; } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - + layout = local->layout; + if (!local->layout) { + gf_log (this->name, GF_LOG_DEBUG, + "no layout for inode=%s", uuid_utoa (fd->inode->gfid)); + op_errno = EINVAL; goto err; } - local->call_cnt = 1; - local->ia_ino = fd->inode->ino; + local->call_cnt = call_cnt = layout->cnt; + local->key = gf_strdup (key); - STACK_WIND (frame, dht_fsync_cbk, - subvol, subvol->fops->fsync, - fd, datasync); + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_removexattr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->fremovexattr, + fd, key, NULL); + } return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL); - - return 0; -} - - -int -dht_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct gf_flock *flock) -{ - DHT_STACK_UNWIND (lk, frame, op_ret, op_errno, flock); + DHT_STACK_UNWIND (fremovexattr, frame, -1, op_errno, NULL); return 0; } int -dht_lk (call_frame_t *frame, xlator_t *this, - fd_t *fd, int cmd, struct gf_flock *flock) +dht_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + local = frame->local; + prev = cookie; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto unlock; + } - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; + local->op_ret = 0; } +unlock: + UNLOCK (&frame->lock); - STACK_WIND (frame, dht_lk_cbk, - subvol, subvol->fops->lk, - fd, cmd, flock); - - return 0; - -err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (lk, frame, -1, op_errno, NULL); + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) + DHT_STACK_UNWIND (open, frame, local->op_ret, local->op_errno, + local->fd, NULL); return 0; } @@ -2777,7 +2859,7 @@ dht_normalize_stats (struct statvfs *buf, unsigned long bsize, int dht_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct statvfs *statvfs) + int op_ret, int op_errno, struct statvfs *statvfs, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; @@ -2823,59 +2905,79 @@ unlock: this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) DHT_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno, - &local->statvfs); + &local->statvfs, xdata); return 0; } int -dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) +dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { + xlator_t *subvol = NULL; dht_local_t *local = NULL; dht_conf_t *conf = NULL; int op_errno = -1; int i = -1; - VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); VALIDATE_OR_GOTO (this->private, err); conf = this->private; - local = dht_local_init (frame); - local->call_cnt = conf->subvolume_cnt; + local = dht_local_init (frame, NULL, NULL, GF_FOP_STATFS); + if (!local) { + op_errno = ENOMEM; + goto err; + } - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (frame, dht_statfs_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->statfs, loc); + if (IA_ISDIR (loc->inode->ia_type)) { + local->call_cnt = conf->subvolume_cnt; + + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_statfs_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->statfs, loc, + xdata); + } + return 0; } + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no 
cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local->call_cnt = 1; + + STACK_WIND (frame, dht_statfs_cbk, + subvol, subvol->fops->statfs, loc, xdata); + return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (statfs, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); return 0; } int -dht_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) +dht_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) { dht_local_t *local = NULL; dht_conf_t *conf = NULL; - int ret = -1; int op_errno = -1; int i = -1; - VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); @@ -2883,35 +2985,27 @@ dht_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) conf = this->private; - local = dht_local_init (frame); + local = dht_local_init (frame, loc, fd, GF_FOP_OPENDIR); if (!local) { op_errno = ENOMEM; goto err; } - local->fd = fd_ref (fd); - ret = loc_dup (loc, &local->loc); - if (ret == -1) { - op_errno = ENOMEM; - - goto err; - } - local->call_cnt = conf->subvolume_cnt; for (i = 0; i < conf->subvolume_cnt; i++) { STACK_WIND (frame, dht_fd_cbk, conf->subvolumes[i], conf->subvolumes[i]->fops->opendir, - loc, fd); + loc, fd, xdata); } return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (opendir, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (opendir, frame, -1, op_errno, NULL, NULL); return 0; } @@ -2919,7 +3013,7 @@ err: int dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, - int op_errno, gf_dirent_t *orig_entries) + int op_errno, gf_dirent_t *orig_entries, dict_t *xdata) { dht_local_t *local = NULL; gf_dirent_t entries; @@ -2932,6 +3026,7 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, dht_layout_t *layout = 0; dht_conf_t *conf = NULL; xlator_t *subvol = 0; + int ret = 0; INIT_LIST_HEAD (&entries.list); prev = cookie; @@ -2948,10 +3043,13 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, list_for_each_entry (orig_entry, (&orig_entries->list), list) { next_offset = orig_entry->d_off; - - if (check_is_linkfile (NULL, (&orig_entry->d_stat), NULL) - || (check_is_dir (NULL, (&orig_entry->d_stat), NULL) - && (prev->this != dht_first_up_subvol (this)))) { + if (check_is_dir (NULL, (&orig_entry->d_stat), NULL) && + (prev->this != local->first_up_subvol)) { + continue; + } + if (check_is_linkfile (NULL, (&orig_entry->d_stat), + orig_entry->dict, + conf->link_xattr_name)) { continue; } @@ -2967,21 +3065,37 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, orig_entry->d_name); if (!subvol || (subvol != prev->this)) { /* TODO: Count the number of entries which need - linkfile to prove its existance in fs */ + linkfile to prove its existence in fs */ layout->search_unhashed++; } } - entry->d_stat = orig_entry->d_stat; - dht_itransform (this, prev->this, orig_entry->d_ino, - &entry->d_ino); dht_itransform (this, prev->this, orig_entry->d_off, &entry->d_off); - entry->d_stat.ia_ino = entry->d_ino; + entry->d_stat = orig_entry->d_stat; + entry->d_ino = orig_entry->d_ino; entry->d_type = orig_entry->d_type; entry->d_len = orig_entry->d_len; + if (orig_entry->dict) + entry->dict = dict_ref 
(orig_entry->dict); + + /* making sure we set the inode ctx right with layout, + currently possible only for non-directories, so for + directories don't set entry inodes */ + if (!IA_ISDIR(entry->d_stat.ia_type)) { + ret = dht_layout_preset (this, prev->this, + orig_entry->inode); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "failed to link the layout in inode"); + entry->inode = inode_ref (orig_entry->inode); + } else if (orig_entry->inode) { + dht_inode_ctx_time_update (orig_entry->inode, this, + &entry->d_stat, 1); + } + list_add_tail (&entry->list, &entries.list); count++; } @@ -3011,9 +3125,23 @@ done: goto unwind; } + if (conf->readdir_optimize == _gf_true) { + if (next_subvol != local->first_up_subvol) { + ret = dict_set_int32 (local->xattr, + GF_READDIR_SKIP_DIRS, 1); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "dict set failed"); + } else { + dict_del (local->xattr, + GF_READDIR_SKIP_DIRS); + } + } + STACK_WIND (frame, dht_readdirp_cbk, next_subvol, next_subvol->fops->readdirp, - local->fd, local->size, next_offset); + local->fd, local->size, next_offset, + local->xattr); return 0; } @@ -3021,7 +3149,7 @@ unwind: if (op_ret < 0) op_ret = 0; - DHT_STACK_UNWIND (readdirp, frame, op_ret, op_errno, &entries); + DHT_STACK_UNWIND (readdirp, frame, op_ret, op_errno, &entries, NULL); gf_dirent_free (&entries); @@ -3032,7 +3160,8 @@ unwind: int dht_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *orig_entries) + int op_ret, int op_errno, gf_dirent_t *orig_entries, + dict_t *xdata) { dht_local_t *local = NULL; gf_dirent_t entries; @@ -3043,13 +3172,11 @@ dht_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, off_t next_offset = 0; int count = 0; dht_layout_t *layout = 0; - dht_conf_t *conf = NULL; xlator_t *subvol = 0; INIT_LIST_HEAD (&entries.list); prev = cookie; local = frame->local; - conf = this->private; if (op_ret < 0) goto done; @@ -3072,11 +3199,10 @@ dht_readdir_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this, goto unwind; } - dht_itransform (this, prev->this, orig_entry->d_ino, - &entry->d_ino); dht_itransform (this, prev->this, orig_entry->d_off, &entry->d_off); + entry->d_ino = orig_entry->d_ino; entry->d_type = orig_entry->d_type; entry->d_len = orig_entry->d_len; @@ -3112,7 +3238,7 @@ done: STACK_WIND (frame, dht_readdir_cbk, next_subvol, next_subvol->fops->readdir, - local->fd, local->size, next_offset); + local->fd, local->size, next_offset, NULL); return 0; } @@ -3120,7 +3246,7 @@ unwind: if (op_ret < 0) op_ret = 0; - DHT_STACK_UNWIND (readdir, frame, op_ret, op_errno, &entries); + DHT_STACK_UNWIND (readdir, frame, op_ret, op_errno, &entries, NULL); gf_dirent_free (&entries); @@ -3130,46 +3256,76 @@ unwind: int dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t yoff, int whichop) + off_t yoff, int whichop, dict_t *dict) { dht_local_t *local = NULL; - dht_conf_t *conf = NULL; int op_errno = -1; xlator_t *xvol = NULL; off_t xoff = 0; - + int ret = 0; + dht_conf_t *conf = NULL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (this->private, err); conf = this->private; - local = dht_local_init (frame); + local = dht_local_init (frame, NULL, NULL, whichop); if (!local) { - op_errno = ENOMEM; goto err; } local->fd = fd_ref (fd); local->size = size; + local->xattr_req = (dict)? 
dict_ref (dict) : NULL; + local->first_up_subvol = dht_first_up_subvol (this); dht_deitransform (this, yoff, &xvol, (uint64_t *)&xoff); /* TODO: do proper readdir */ - if (whichop == GF_FOP_READDIR) - STACK_WIND (frame, dht_readdir_cbk, xvol, xvol->fops->readdir, - fd, size, xoff); - else + if (whichop == GF_FOP_READDIRP) { + if (dict) + local->xattr = dict_ref (dict); + else + local->xattr = dict_new (); + + if (local->xattr) { + ret = dict_set_uint32 (local->xattr, + conf->link_xattr_name, 256); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "failed to set '%s' key", + conf->link_xattr_name); + if (conf->readdir_optimize == _gf_true) { + if (xvol != local->first_up_subvol) { + ret = dict_set_int32 (local->xattr, + GF_READDIR_SKIP_DIRS, 1); + if (ret) + gf_log (this->name, + GF_LOG_ERROR, + "Dict set failed"); + } else { + dict_del (local->xattr, + GF_READDIR_SKIP_DIRS); + } + } + } + STACK_WIND (frame, dht_readdirp_cbk, xvol, xvol->fops->readdirp, - fd, size, xoff); + fd, size, xoff, local->xattr); + } else { + STACK_WIND (frame, dht_readdir_cbk, xvol, xvol->fops->readdir, + fd, size, xoff, local->xattr); + } return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (readdir, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (readdir, frame, -1, op_errno, NULL, NULL); return 0; } @@ -3177,7 +3333,7 @@ err: int dht_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t yoff) + off_t yoff, dict_t *xdata) { int op = GF_FOP_READDIR; dht_conf_t *conf = NULL; @@ -3198,15 +3354,15 @@ dht_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, op = GF_FOP_READDIRP; out: - dht_do_readdir (frame, this, fd, size, yoff, op); + dht_do_readdir (frame, this, fd, size, yoff, op, 0); return 0; } int dht_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t yoff) + off_t yoff, dict_t *dict) { - dht_do_readdir (frame, this, fd, size, yoff, GF_FOP_READDIRP); + dht_do_readdir (frame, this, fd, size, yoff, GF_FOP_READDIRP, dict); return 0; } @@ -3214,7 +3370,7 @@ dht_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, int dht_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) + int op_ret, int op_errno, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; @@ -3234,21 +3390,22 @@ dht_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) - DHT_STACK_UNWIND (fsyncdir, frame, local->op_ret, local->op_errno); + DHT_STACK_UNWIND (fsyncdir, frame, local->op_ret, + local->op_errno, xdata); return 0; } int -dht_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync) +dht_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, + int datasync, dict_t *xdata) { dht_local_t *local = NULL; dht_conf_t *conf = NULL; int op_errno = -1; int i = -1; - VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); @@ -3256,10 +3413,9 @@ dht_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync) conf = this->private; - local = dht_local_init (frame); + local = 
dht_local_init (frame, NULL, NULL, GF_FOP_FSYNCDIR); if (!local) { op_errno = ENOMEM; - goto err; } @@ -3270,14 +3426,14 @@ dht_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync) STACK_WIND (frame, dht_fsyncdir_cbk, conf->subvolumes[i], conf->subvolumes[i]->fops->fsyncdir, - fd, datasync); + fd, datasync, xdata); } return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (fsyncdir, frame, -1, op_errno); + DHT_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); return 0; } @@ -3287,9 +3443,9 @@ int dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { - call_frame_t *prev = NULL; + xlator_t *prev = NULL; int ret = -1; dht_local_t *local = NULL; @@ -3306,24 +3462,25 @@ dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, prev = cookie; - dht_itransform (this, prev->this, stbuf->ia_ino, &stbuf->ia_ino); if (local->loc.parent) { - preparent->ia_ino = local->loc.parent->ino; - postparent->ia_ino = local->loc.parent->ino; - WIPE (preparent); - WIPE (postparent); + dht_inode_ctx_time_update (local->loc.parent, this, + preparent, 0); + dht_inode_ctx_time_update (local->loc.parent, this, + postparent, 1); } - ret = dht_layout_preset (this, prev->this, inode); + ret = dht_layout_preset (this, prev, inode); if (ret < 0) { gf_log (this->name, GF_LOG_DEBUG, "could not set pre-set layout for subvolume %s", - prev->this->name); + prev? prev->name: NULL); op_ret = -1; op_errno = EINVAL; goto out; } + if (local->linked == _gf_true) + dht_linkfile_attr_heal (frame, this); out: /* * FIXME: ia_size and st_blocks of preparent and postparent do not have @@ -3332,9 +3489,9 @@ out: * corresponding values from each of the subvolume. * See dht_iatt_merge for reference. 
*/ - - DHT_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode, stbuf, preparent, - postparent); + DHT_STRIP_PHASE1_FLAGS (stbuf); + DHT_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode, stbuf, + preparent, postparent, xdata); return 0; } @@ -3343,7 +3500,8 @@ dht_mknod_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; xlator_t *cached_subvol = NULL; @@ -3352,42 +3510,43 @@ dht_mknod_linkfile_create_cbk (call_frame_t *frame, void *cookie, goto err; local = frame->local; + if (!local || !local->cached_subvol) { + op_errno = EINVAL; + goto err; + } + cached_subvol = local->cached_subvol; - STACK_WIND (frame, dht_newfile_cbk, - cached_subvol, cached_subvol->fops->mknod, - &local->loc, local->mode, local->rdev, - local->params); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)cached_subvol, + cached_subvol, cached_subvol->fops->mknod, + &local->loc, local->mode, local->rdev, local->umask, + local->params); return 0; err: - DHT_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); return 0; } int dht_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t rdev, dict_t *params) + loc_t *loc, mode_t mode, dev_t rdev, mode_t umask, dict_t *params) { xlator_t *subvol = NULL; int op_errno = -1; - int ret = -1; xlator_t *avail_subvol = NULL; - dht_conf_t *conf = NULL; dht_local_t *local = NULL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); - conf = this->private; - dht_get_du_info (frame, this, loc); - local = dht_local_init (frame); + local = dht_local_init (frame, loc, NULL, GF_FOP_MKNOD); if (!local) { op_errno = ENOMEM; - goto err; } @@ -3400,22 +3559,17 @@ dht_mknod 
(call_frame_t *frame, xlator_t *this, goto err; } - ret = loc_dup (loc, &local->loc); - if (ret == -1) { - op_errno = ENOMEM; - - goto err; - } - if (!dht_is_subvol_filled (this, subvol)) { gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->mknod, - loc, mode, rdev, params); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, + subvol, subvol->fops->mknod, loc, mode, + rdev, umask, params); } else { - avail_subvol = dht_free_disk_available_subvol (this, subvol); + + avail_subvol = dht_free_disk_available_subvol (this, subvol, + local); if (avail_subvol != subvol) { /* Choose the minimum filled volume, and create the files there */ @@ -3424,17 +3578,18 @@ dht_mknod (call_frame_t *frame, xlator_t *this, local->cached_subvol = avail_subvol; local->mode = mode; local->rdev = rdev; - + local->umask = umask; dht_linkfile_create (frame, dht_mknod_linkfile_create_cbk, - avail_subvol, subvol, loc); + this, avail_subvol, subvol, loc); } else { gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->mknod, - loc, mode, rdev, params); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, + (void *)subvol, subvol, + subvol->fops->mknod, loc, mode, + rdev, umask, params); } } @@ -3443,7 +3598,7 @@ dht_mknod (call_frame_t *frame, xlator_t *this, err: op_errno = (op_errno == -1) ? 
errno : op_errno; DHT_STACK_UNWIND (mknod, frame, -1, op_errno, - NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL); return 0; } @@ -3451,22 +3606,19 @@ err: int dht_symlink (call_frame_t *frame, xlator_t *this, - const char *linkname, loc_t *loc, dict_t *params) + const char *linkname, loc_t *loc, mode_t umask, dict_t *params) { xlator_t *subvol = NULL; int op_errno = -1; dht_local_t *local = NULL; - int ret = -1; - VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); - local = dht_local_init (frame); + local = dht_local_init (frame, loc, NULL, GF_FOP_SYMLINK); if (!local) { op_errno = ENOMEM; - goto err; } @@ -3479,61 +3631,52 @@ dht_symlink (call_frame_t *frame, xlator_t *this, goto err; } - ret = loc_copy (&local->loc, loc); - if (ret == -1) { - gf_log (this->name, GF_LOG_TRACE, "Failed to copy loc"); - op_errno = ENOMEM; - goto err; - } - gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->symlink, - linkname, loc, params); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, subvol, + subvol->fops->symlink, linkname, loc, umask, + params); return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; DHT_STACK_UNWIND (link, frame, -1, op_errno, - NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL); return 0; } int -dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) +dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) { xlator_t *cached_subvol = NULL; xlator_t *hashed_subvol = NULL; - int ret = -1; int op_errno = -1; dht_local_t *local = NULL; - VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); if (dht_filter_loc_subvol_key (this, loc, &local->loc, &cached_subvol)) { - gf_log (this->name, GF_LOG_NORMAL, + gf_log (this->name, GF_LOG_INFO, "unlinking %s on %s (given path %s)", local->loc.path, cached_subvol->name, loc->path); STACK_WIND (frame, dht_unlink_cbk, cached_subvol, cached_subvol->fops->unlink, - &local->loc); + &local->loc, xflag, xdata); goto done; } - cached_subvol = dht_subvol_get_cached (this, loc->inode); - if (!cached_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; + local = dht_local_init (frame, loc, NULL, GF_FOP_UNLINK); + if (!local) { + op_errno = ENOMEM; + goto err; } @@ -3546,32 +3689,29 @@ dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) goto err; } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - - goto err; - } - - ret = loc_copy (&local->loc, loc); - if (ret == -1) { - op_errno = ENOMEM; - + cached_subvol = local->cached_subvol; + if (!cached_subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; goto err; } + local->flags = xflag; if (hashed_subvol != cached_subvol) { STACK_WIND (frame, dht_unlink_linkfile_cbk, - hashed_subvol, hashed_subvol->fops->unlink, loc); + hashed_subvol, hashed_subvol->fops->unlink, loc, + xflag, xdata); } else { STACK_WIND (frame, dht_unlink_cbk, - cached_subvol, cached_subvol->fops->unlink, loc); + cached_subvol, 
cached_subvol->fops->unlink, loc, + xflag, xdata); } done: return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL); + DHT_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -3581,13 +3721,14 @@ int dht_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { call_frame_t *prev = NULL; dht_layout_t *layout = NULL; dht_local_t *local = NULL; prev = cookie; + local = frame->local; if (op_ret == -1) @@ -3603,17 +3744,20 @@ dht_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } - stbuf->ia_ino = local->loc.inode->ino; - - preparent->ia_ino = local->loc2.parent->ino; - postparent->ia_ino = local->loc2.parent->ino; - - WIPE (preparent); - WIPE (postparent); - + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + preparent, 0); + dht_inode_ctx_time_update (local->loc.parent, this, + postparent, 1); + } + if (local->linked == _gf_true) { + local->stbuf = *stbuf; + dht_linkfile_attr_heal (frame, this); + } out: + DHT_STRIP_PHASE1_FLAGS (stbuf); DHT_STACK_UNWIND (link, frame, op_ret, op_errno, inode, stbuf, preparent, - postparent); + postparent, NULL); return 0; } @@ -3623,27 +3767,27 @@ int dht_link_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; xlator_t *srcvol = NULL; - if (op_ret == -1) goto err; local = frame->local; srcvol = local->linkfile.srcvol; - STACK_WIND (frame, dht_link_cbk, - srcvol, srcvol->fops->link, - &local->loc, &local->loc2); + STACK_WIND (frame, dht_link_cbk, srcvol, srcvol->fops->link, + &local->loc, &local->loc2, xdata); 
return 0; err: + DHT_STRIP_PHASE1_FLAGS (stbuf); DHT_STACK_UNWIND (link, frame, op_ret, op_errno, inode, stbuf, preparent, - postparent); + postparent, NULL); return 0; } @@ -3651,7 +3795,7 @@ err: int dht_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc) + loc_t *oldloc, loc_t *newloc, dict_t *xdata) { xlator_t *cached_subvol = NULL; xlator_t *hashed_subvol = NULL; @@ -3659,13 +3803,19 @@ dht_link (call_frame_t *frame, xlator_t *this, int ret = -1; dht_local_t *local = NULL; - VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (oldloc, err); VALIDATE_OR_GOTO (newloc, err); - cached_subvol = dht_subvol_get_cached (this, oldloc->inode); + local = dht_local_init (frame, oldloc, NULL, GF_FOP_LINK); + if (!local) { + op_errno = ENOMEM; + + goto err; + } + + cached_subvol = local->cached_subvol; if (!cached_subvol) { gf_log (this->name, GF_LOG_DEBUG, "no cached subvolume for path=%s", oldloc->path); @@ -3682,42 +3832,27 @@ dht_link (call_frame_t *frame, xlator_t *this, goto err; } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - - goto err; - } - - ret = loc_copy (&local->loc, oldloc); - if (ret == -1) { - op_errno = ENOMEM; - - goto err; - } - ret = loc_copy (&local->loc2, newloc); if (ret == -1) { op_errno = ENOMEM; - goto err; } if (hashed_subvol != cached_subvol) { - memcpy (local->gfid, oldloc->inode->gfid, 16); - dht_linkfile_create (frame, dht_link_linkfile_cbk, + uuid_copy (local->gfid, oldloc->inode->gfid); + dht_linkfile_create (frame, dht_link_linkfile_cbk, this, cached_subvol, hashed_subvol, newloc); } else { STACK_WIND (frame, dht_link_cbk, cached_subvol, cached_subvol->fops->link, - oldloc, newloc); + oldloc, newloc, xdata); } return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); return 0; } @@ -3727,7 +3862,7 @@ int dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { call_frame_t *prev = NULL; int ret = -1; @@ -3745,13 +3880,12 @@ dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, prev = cookie; - dht_itransform (this, prev->this, stbuf->ia_ino, &stbuf->ia_ino); if (local->loc.parent) { - preparent->ia_ino = local->loc.parent->ino; - postparent->ia_ino = local->loc.parent->ino; + dht_inode_ctx_time_update (local->loc.parent, this, + preparent, 0); - WIPE (preparent); - WIPE (postparent); + dht_inode_ctx_time_update (local->loc.parent, this, + postparent, 1); } ret = dht_layout_preset (this, prev->this, inode); @@ -3763,10 +3897,14 @@ dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, op_errno = EINVAL; goto out; } - + if (local->linked == _gf_true) { + local->stbuf = *stbuf; + dht_linkfile_attr_heal (frame, this); + } out: + DHT_STRIP_PHASE1_FLAGS (stbuf); DHT_STACK_UNWIND (create, frame, op_ret, op_errno, fd, inode, stbuf, preparent, - postparent); + postparent, NULL); return 0; } @@ -3776,7 +3914,8 @@ dht_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; xlator_t *cached_subvol = NULL; @@ -3790,23 +3929,22 @@ dht_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, STACK_WIND (frame, dht_create_cbk, cached_subvol, cached_subvol->fops->create, &local->loc, local->flags, local->mode, - 
local->fd, local->params); + local->umask, local->fd, local->params); return 0; err: - DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL, NULL); return 0; } int dht_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params) + mode_t umask, fd_t *fd, dict_t *params) { int op_errno = -1; - int ret = -1; xlator_t *subvol = NULL; - dht_conf_t *conf = NULL; dht_local_t *local = NULL; xlator_t *avail_subvol = NULL; @@ -3814,34 +3952,25 @@ dht_create (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); - conf = this->private; - dht_get_du_info (frame, this, loc); - local = dht_local_init (frame); + local = dht_local_init (frame, loc, fd, GF_FOP_CREATE); if (!local) { - op_errno = ENOMEM; goto err; } if (dht_filter_loc_subvol_key (this, loc, &local->loc, &subvol)) { - gf_log (this->name, GF_LOG_NORMAL, + gf_log (this->name, GF_LOG_INFO, "creating %s on %s (got create on %s)", local->loc.path, subvol->name, loc->path); STACK_WIND (frame, dht_create_cbk, subvol, subvol->fops->create, - &local->loc, flags, mode, fd, params); + &local->loc, flags, mode, umask, fd, params); goto done; } - ret = loc_dup (loc, &local->loc); - if (ret == -1) { - op_errno = ENOMEM; - - goto err; - } subvol = dht_subvol_get_hashed (this, loc); if (!subvol) { gf_log (this->name, GF_LOG_DEBUG, @@ -3856,40 +3985,38 @@ dht_create (call_frame_t *frame, xlator_t *this, "creating %s on %s", loc->path, subvol->name); STACK_WIND (frame, dht_create_cbk, subvol, subvol->fops->create, - loc, flags, mode, fd, params); + loc, flags, mode, umask, fd, params); goto done; } /* Choose the minimum filled volume, and create the files there */ - /* TODO */ - avail_subvol = dht_free_disk_available_subvol (this, subvol); + avail_subvol = dht_free_disk_available_subvol (this, subvol, local); if (avail_subvol != subvol) { 
- local->fd = fd_ref (fd); local->params = dict_ref (params); local->flags = flags; local->mode = mode; - + local->umask = umask; local->cached_subvol = avail_subvol; local->hashed_subvol = subvol; gf_log (this->name, GF_LOG_TRACE, "creating %s on %s (link at %s)", loc->path, avail_subvol->name, subvol->name); - dht_linkfile_create (frame, - dht_create_linkfile_create_cbk, - avail_subvol, subvol, loc); + dht_linkfile_create (frame, dht_create_linkfile_create_cbk, + this, avail_subvol, subvol, loc); goto done; } gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); STACK_WIND (frame, dht_create_cbk, subvol, subvol->fops->create, - loc, flags, mode, fd, params); + loc, flags, mode, umask, fd, params); done: return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL, NULL); return 0; } @@ -3898,30 +4025,28 @@ err: int dht_mkdir_selfheal_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { dht_local_t *local = NULL; dht_layout_t *layout = NULL; - local = frame->local; layout = local->selfheal.layout; if (op_ret == 0) { dht_layout_set (this, local->inode, layout); - local->stbuf.ia_ino = local->ia_ino; if (local->loc.parent) { - local->preparent.ia_ino = local->loc.parent->ino; - local->postparent.ia_ino = local->loc.parent->ino; + dht_inode_ctx_time_update (local->loc.parent, this, + &local->preparent, 0); - WIPE (&local->preparent); - WIPE (&local->postparent); + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); } } DHT_STACK_UNWIND (mkdir, frame, op_ret, op_errno, local->inode, &local->stbuf, &local->preparent, - &local->postparent); + &local->postparent, NULL); return 0; } @@ -3929,17 +4054,15 @@ dht_mkdir_selfheal_cbk (call_frame_t *frame, void *cookie, int 
dht_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; int ret = -1; - int subvol_filled = 0; + gf_boolean_t subvol_filled = _gf_false; call_frame_t *prev = NULL; dht_layout_t *layout = NULL; - dht_conf_t *conf = NULL; - conf = this->private; local = frame->local; prev = cookie; layout = local->layout; @@ -3952,9 +4075,21 @@ dht_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, ret = dht_layout_merge (this, layout, prev->this, -1, ENOSPC, NULL); } else { + if (op_ret == -1 && op_errno == EEXIST) + /* Very likely just a race between mkdir and + self-heal (from lookup of a concurrent mkdir + attempt). + Ignore error for now. layout setting will + anyways fail if this was a different (old) + pre-existing different directory. + */ + op_ret = 0; ret = dht_layout_merge (this, layout, prev->this, op_ret, op_errno, NULL); } + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to merge layouts", local->loc.path); if (op_ret == -1) { local->op_errno = op_errno; @@ -3964,11 +4099,6 @@ dht_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dht_iatt_merge (this, &local->preparent, preparent, prev->this); dht_iatt_merge (this, &local->postparent, postparent, prev->this); - - if (prev->this == dht_first_up_subvol (this)) { - local->ia_ino = local->stbuf.ia_ino; - } - } unlock: UNLOCK (&frame->lock); @@ -3986,7 +4116,8 @@ int dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; int ret = -1; @@ -4004,8 +4135,8 @@ dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie, conf = 
this->private; hashed_subvol = local->hashed_subvol; - if (uuid_is_null (local->loc.inode->gfid) && !op_ret) - memcpy (local->loc.inode->gfid, stbuf->ia_gfid, 16); + if (uuid_is_null (local->loc.gfid) && !op_ret) + uuid_copy (local->loc.gfid, stbuf->ia_gfid); if (dht_is_subvol_filled (this, hashed_subvol)) ret = dht_layout_merge (this, layout, prev->this, @@ -4014,6 +4145,12 @@ dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie, ret = dht_layout_merge (this, layout, prev->this, op_ret, op_errno, NULL); + /* TODO: we may have to return from the function + if layout merge fails. For now, lets just log an error */ + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to merge layouts", local->loc.path); + if (op_ret == -1) { local->op_errno = op_errno; goto err; @@ -4024,10 +4161,10 @@ dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie, dht_iatt_merge (this, &local->preparent, preparent, prev->this); dht_iatt_merge (this, &local->postparent, postparent, prev->this); - local->ia_ino = local->stbuf.ia_ino; - local->call_cnt = conf->subvolume_cnt - 1; + if (uuid_is_null (local->loc.gfid)) + uuid_copy (local->loc.gfid, stbuf->ia_gfid); if (local->call_cnt == 0) { dht_selfheal_directory (frame, dht_mkdir_selfheal_cbk, &local->loc, layout); @@ -4037,24 +4174,24 @@ dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie, continue; STACK_WIND (frame, dht_mkdir_cbk, conf->subvolumes[i], - conf->subvolumes[i]->fops->mkdir, - &local->loc, local->mode, local->params); + conf->subvolumes[i]->fops->mkdir, &local->loc, + local->mode, local->umask, local->params); } return 0; err: - DHT_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL); return 0; } -int + int dht_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dict_t *params) + loc_t *loc, mode_t mode, mode_t umask, dict_t *params) { dht_local_t *local = NULL; dht_conf_t *conf = NULL; int op_errno 
= -1; - int ret = -1; xlator_t *hashed_subvol = NULL; @@ -4069,15 +4206,13 @@ dht_mkdir (call_frame_t *frame, xlator_t *this, dht_get_du_info (frame, this, loc); - local = dht_local_init (frame); + local = dht_local_init (frame, loc, NULL, GF_FOP_MKDIR); if (!local) { - op_errno = ENOMEM; goto err; } hashed_subvol = dht_subvol_get_hashed (this, loc); - if (hashed_subvol == NULL) { gf_log (this->name, GF_LOG_DEBUG, "hashed subvol not found for %s", @@ -4087,21 +4222,13 @@ dht_mkdir (call_frame_t *frame, xlator_t *this, } local->hashed_subvol = hashed_subvol; - local->inode = inode_ref (loc->inode); - ret = loc_copy (&local->loc, loc); local->mode = mode; - - if (ret == -1) { - - op_errno = ENOMEM; - goto err; - } - + local->umask = umask; local->params = dict_ref (params); + local->inode = inode_ref (loc->inode); local->layout = dht_layout_new (this, conf->subvolume_cnt); if (!local->layout) { - op_errno = ENOMEM; goto err; } @@ -4109,13 +4236,14 @@ dht_mkdir (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, dht_mkdir_hashed_cbk, hashed_subvol, hashed_subvol->fops->mkdir, - loc, mode, params); + loc, mode, umask, params); return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL); return 0; } @@ -4123,19 +4251,87 @@ err: int dht_rmdir_selfheal_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) + int op_ret, int op_errno, dict_t *xdata) { dht_local_t *local = NULL; local = frame->local; - if (local->loc.parent) { - local->preparent.ia_ino = local->loc.parent->ino; - local->postparent.ia_ino = local->loc.parent->ino; + DHT_STACK_UNWIND (rmdir, frame, local->op_ret, local->op_errno, + &local->preparent, &local->postparent, NULL); + + return 0; +} + + +int +dht_rmdir_hashed_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + + local = frame->local; + prev = cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + local->op_ret = -1; + if (op_errno != ENOENT && op_errno != EACCES) { + local->need_selfheal = 1; + } + + + gf_log (this->name, GF_LOG_DEBUG, + "rmdir on %s for %s failed (%s)", + prev->this->name, local->loc.path, + strerror (op_errno)); + goto unlock; + } + + dht_iatt_merge (this, &local->preparent, preparent, prev->this); + dht_iatt_merge (this, &local->postparent, postparent, + prev->this); + } +unlock: + UNLOCK (&frame->lock); - DHT_STACK_UNWIND (rmdir, frame, local->op_ret, local->op_errno, - &local->preparent, &local->postparent); + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + if (local->need_selfheal) { + local->layout = + dht_layout_get (this, local->loc.inode); + + /* TODO: neater interface needed below */ + local->stbuf.ia_type = local->loc.inode->ia_type; + + uuid_copy (local->gfid, local->loc.inode->gfid); + dht_selfheal_restore (frame, dht_rmdir_selfheal_cbk, + 
&local->loc, local->layout); + } else { + + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, + this, + &local->preparent, + 0); + + dht_inode_ctx_time_update (local->loc.parent, + this, + &local->postparent, + 1); + } + + DHT_STACK_UNWIND (rmdir, frame, local->op_ret, + local->op_errno, &local->preparent, + &local->postparent, NULL); + } + } return 0; } @@ -4144,11 +4340,12 @@ dht_rmdir_selfheal_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int dht_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; call_frame_t *prev = NULL; + int done = 0; local = frame->local; prev = cookie; @@ -4159,8 +4356,9 @@ dht_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_errno = op_errno; local->op_ret = -1; - if (op_errno != ENOENT) + if (op_errno != ENOENT && op_errno != EACCES) { local->need_selfheal = 1; + } gf_log (this->name, GF_LOG_DEBUG, "rmdir on %s for %s failed (%s)", @@ -4169,6 +4367,8 @@ dht_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto unlock; } + /* Track if rmdir succeeded on atleast one subvol*/ + local->fop_succeeded = 1; dht_iatt_merge (this, &local->preparent, preparent, prev->this); dht_iatt_merge (this, &local->postparent, postparent, prev->this); @@ -4178,30 +4378,54 @@ unlock: this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - if (local->need_selfheal) { + + /* if local->hashed_subvol, we are yet to wind to hashed_subvol. 
*/ + if (local->hashed_subvol && (this_call_cnt == 1)) { + done = 1; + } else if (!local->hashed_subvol && !this_call_cnt) { + done = 1; + } + + + if (done) { + if (local->need_selfheal && local->fop_succeeded) { local->layout = dht_layout_get (this, local->loc.inode); /* TODO: neater interface needed below */ local->stbuf.ia_type = local->loc.inode->ia_type; + uuid_copy (local->gfid, local->loc.inode->gfid); dht_selfheal_restore (frame, dht_rmdir_selfheal_cbk, &local->loc, local->layout); - } else { + } else if (this_call_cnt) { + /* If non-hashed subvol's have responded, proceed */ + + local->need_selfheal = 0; + STACK_WIND (frame, dht_rmdir_hashed_subvol_cbk, + local->hashed_subvol, + local->hashed_subvol->fops->rmdir, + &local->loc, local->flags, NULL); + } else if (!this_call_cnt) { + /* All subvol's have responded, proceed */ + if (local->loc.parent) { - local->preparent.ia_ino = - local->loc.parent->ino; - local->postparent.ia_ino = - local->loc.parent->ino; - WIPE (&local->preparent); - WIPE (&local->postparent); + dht_inode_ctx_time_update (local->loc.parent, + this, + &local->preparent, + 0); + + dht_inode_ctx_time_update (local->loc.parent, + this, + &local->postparent, + 1); + } DHT_STACK_UNWIND (rmdir, frame, local->op_ret, local->op_errno, &local->preparent, - &local->postparent); + &local->postparent, NULL); } } @@ -4215,6 +4439,7 @@ dht_rmdir_do (call_frame_t *frame, xlator_t *this) dht_local_t *local = NULL; dht_conf_t *conf = NULL; int i = 0; + xlator_t *hashed_subvol = NULL; VALIDATE_OR_GOTO (this->private, err); @@ -4226,18 +4451,41 @@ dht_rmdir_do (call_frame_t *frame, xlator_t *this) local->call_cnt = conf->subvolume_cnt; + /* first remove from non-hashed_subvol */ + hashed_subvol = dht_subvol_get_hashed (this, &local->loc); + + if (!hashed_subvol) { + gf_log (this->name, GF_LOG_WARNING, "failed to get hashed " + "subvol for %s",local->loc.path); + } else { + local->hashed_subvol = hashed_subvol; + } + + /* When DHT has only 1 child */ + if 
(conf->subvolume_cnt == 1) { + STACK_WIND (frame, dht_rmdir_hashed_subvol_cbk, + conf->subvolumes[0], + conf->subvolumes[0]->fops->rmdir, + &local->loc, local->flags, NULL); + return 0; + } + for (i = 0; i < conf->subvolume_cnt; i++) { + if (hashed_subvol && + (hashed_subvol == conf->subvolumes[i])) + continue; + STACK_WIND (frame, dht_rmdir_cbk, conf->subvolumes[i], conf->subvolumes[i]->fops->rmdir, - &local->loc, local->flags); + &local->loc, local->flags, NULL); } return 0; err: DHT_STACK_UNWIND (rmdir, frame, local->op_ret, local->op_errno, - &local->preparent, &local->postparent); + &local->preparent, &local->postparent, NULL); return 0; } @@ -4245,7 +4493,7 @@ err: int dht_rmdir_linkfile_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -4293,6 +4541,7 @@ dht_rmdir_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_frame_t *main_frame = NULL; dht_local_t *main_local = NULL; int this_call_cnt = 0; + dht_conf_t *conf = this->private; local = frame->local; prev = cookie; @@ -4304,7 +4553,7 @@ dht_rmdir_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret != 0) goto err; - if (check_is_linkfile (inode, stbuf, xattr) == 0) { + if (!check_is_linkfile (inode, stbuf, xattr, conf->link_xattr_name)) { main_local->op_ret = -1; main_local->op_errno = ENOTEMPTY; @@ -4315,7 +4564,7 @@ dht_rmdir_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } STACK_WIND (frame, dht_rmdir_linkfile_unlink_cbk, - src, src->fops->unlink, &local->loc); + src, src->fops->unlink, &local->loc, 0, NULL); return 0; err: @@ -4338,6 +4587,8 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this, call_frame_t *lookup_frame = NULL; dht_local_t *lookup_local = NULL; dht_local_t *local = NULL; + dict_t *xattrs = NULL; + dht_conf_t *conf = this->private; 
local = frame->local; @@ -4346,7 +4597,8 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this, continue; if (strcmp (trav->d_name, "..") == 0) continue; - if (check_is_linkfile (NULL, (&trav->d_stat), NULL) == 1) { + if (check_is_linkfile (NULL, (&trav->d_stat), trav->dict, + conf->link_xattr_name)) { ret++; continue; } @@ -4358,6 +4610,21 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this, return 0; } + xattrs = dict_new (); + if (!xattrs) { + gf_log (this->name, GF_LOG_ERROR, "dict_new failed"); + return -1; + } + + ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to set linkto key" + " in dict"); + if (xattrs) + dict_unref (xattrs); + return -1; + } + list_for_each_entry (trav, &entries->list, list) { if (strcmp (trav->d_name, ".") == 0) continue; @@ -4374,8 +4641,7 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this, goto err; } - lookup_local = GF_CALLOC (sizeof (*local), 1, - gf_dht_mt_dht_local_t); + lookup_local = mem_get0 (this->local_pool); if (!lookup_local) { goto err; } @@ -4388,6 +4654,8 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this, if (build_ret != 0) goto err; + uuid_copy (lookup_local->loc.gfid, trav->d_stat.ia_gfid); + gf_log (this->name, GF_LOG_TRACE, "looking up %s on %s", lookup_local->loc.path, src->name); @@ -4400,12 +4668,18 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this, STACK_WIND (lookup_frame, dht_rmdir_lookup_cbk, src, src->fops->lookup, - &lookup_local->loc, NULL); + &lookup_local->loc, xattrs); ret++; } + if (xattrs) + dict_unref (xattrs); + return ret; err: + if (xattrs) + dict_unref (xattrs); + DHT_STACK_DESTROY (lookup_frame); return 0; } @@ -4413,7 +4687,8 @@ err: int dht_rmdir_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *entries) + int op_ret, int op_errno, gf_dirent_t *entries, + dict_t *xdata) { dht_local_t *local = 
NULL; int this_call_cnt = -1; @@ -4457,12 +4732,14 @@ dht_rmdir_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd) + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = -1; call_frame_t *prev = NULL; - + dict_t *dict = NULL; + int ret = 0; + dht_conf_t *conf = this->private; local = frame->local; prev = cookie; @@ -4472,12 +4749,32 @@ dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "opendir on %s for %s failed (%s)", prev->this->name, local->loc.path, strerror (op_errno)); + if (op_errno != ENOENT) { + local->op_ret = -1; + local->op_errno = op_errno; + } goto err; } + dict = dict_new (); + if (!dict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto err; + } + + ret = dict_set_uint32 (dict, conf->link_xattr_name, 256); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set '%s' key", + local->loc.path, conf->link_xattr_name); + STACK_WIND (frame, dht_rmdir_readdirp_cbk, prev->this, prev->this->fops->readdirp, - local->fd, 4096, 0); + local->fd, 4096, 0, dict); + + if (dict) + dict_unref (dict); return 0; @@ -4493,14 +4790,13 @@ err: int -dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags) +dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) { dht_local_t *local = NULL; dht_conf_t *conf = NULL; int op_errno = -1; int i = -1; - int ret = -1; - VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -4511,22 +4807,15 @@ dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags) conf = this->private; - local = dht_local_init (frame); + local = dht_local_init (frame, loc, NULL, GF_FOP_RMDIR); if (!local) { - op_errno = ENOMEM; goto err; } local->call_cnt = conf->subvolume_cnt; local->op_ret = 0; - - ret = loc_copy (&local->loc, loc); - if (ret == -1) { - - op_errno = 
ENOMEM; - goto err; - } + local->fop_succeeded = 0; local->flags = flags; @@ -4541,7 +4830,7 @@ dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags) STACK_WIND (frame, dht_rmdir_opendir_cbk, conf->subvolumes[i], conf->subvolumes[i]->fops->opendir, - loc, local->fd); + loc, local->fd, NULL); } return 0; @@ -4549,240 +4838,42 @@ dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags) err: op_errno = (op_errno == -1) ? errno : op_errno; DHT_STACK_UNWIND (rmdir, frame, -1, op_errno, - NULL, NULL); - - return 0; -} + NULL, NULL, NULL); - -int -dht_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict) -{ - DHT_STACK_UNWIND (xattrop, frame, op_ret, op_errno, dict); return 0; } - int -dht_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t flags, dict_t *dict) -{ - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - subvol = dht_subvol_get_cached (this, loc->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } - - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - - goto err; - } - - local->inode = inode_ref (loc->inode); - local->call_cnt = 1; - - STACK_WIND (frame, - dht_xattrop_cbk, - subvol, subvol->fops->xattrop, - loc, flags, dict); - - return 0; - -err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL); - - return 0; -} - - -int -dht_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict) -{ - DHT_STACK_UNWIND (fxattrop, frame, op_ret, op_errno, dict); - return 0; -} - +dht_entrylk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) -int -dht_fxattrop (call_frame_t *frame, xlator_t *this, - fd_t *fd, gf_xattrop_flags_t flags, dict_t *dict) { - xlator_t *subvol = NULL; - int op_errno = -1; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, - dht_fxattrop_cbk, - subvol, subvol->fops->fxattrop, - fd, flags, dict); - - return 0; - -err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL); - + DHT_STACK_UNWIND (entrylk, frame, op_ret, op_errno, xdata); return 0; } int -dht_inodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) - -{ - DHT_STACK_UNWIND (inodelk, frame, op_ret, op_errno); - return 0; -} - - -int32_t -dht_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *lock) +dht_entrylk (call_frame_t *frame, xlator_t *this, + const char *volume, loc_t *loc, const char *basename, + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; dht_local_t *local = NULL; - VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - subvol = dht_subvol_get_cached (this, loc->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } - - local = dht_local_init (frame); + local = dht_local_init (frame, loc, NULL, GF_FOP_ENTRYLK); if (!local) { op_errno = ENOMEM; - goto err; } - local->inode = inode_ref (loc->inode); - local->call_cnt = 1; - - STACK_WIND (frame, - dht_inodelk_cbk, - subvol, subvol->fops->inodelk, - volume, loc, cmd, lock); - - return 0; - -err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (inodelk, frame, -1, op_errno); - - return 0; -} - - -int -dht_finodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) - -{ - DHT_STACK_UNWIND (finodelk, frame, op_ret, op_errno); - return 0; -} - - -int -dht_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *lock) -{ - xlator_t *subvol = NULL; - int op_errno = -1; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } - - - STACK_WIND (frame, - dht_finodelk_cbk, - subvol, subvol->fops->finodelk, - volume, fd, cmd, lock); - - return 0; - -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (finodelk, frame, -1, op_errno); - - return 0; -} - - -int -dht_entrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) - -{ - DHT_STACK_UNWIND (entrylk, frame, op_ret, op_errno); - return 0; -} - - -int -dht_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type) -{ - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - subvol = dht_subvol_get_cached (this, loc->inode); + subvol = local->cached_subvol; if (!subvol) { gf_log (this->name, GF_LOG_DEBUG, "no cached subvolume for path=%s", loc->path); @@ -4790,25 +4881,17 @@ dht_entrylk (call_frame_t *frame, xlator_t *this, goto err; } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - - goto err; - } - - local->inode = inode_ref (loc->inode); local->call_cnt = 
1; STACK_WIND (frame, dht_entrylk_cbk, subvol, subvol->fops->entrylk, - volume, loc, basename, cmd, type); + volume, loc, basename, cmd, type, xdata); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (entrylk, frame, -1, op_errno); + DHT_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL); return 0; } @@ -4816,10 +4899,10 @@ err: int dht_fentrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - DHT_STACK_UNWIND (fentrylk, frame, op_ret, op_errno); + DHT_STACK_UNWIND (fentrylk, frame, op_ret, op_errno, xdata); /* forward the child's xdata, as dht_entrylk_cbk does */ return 0; } @@ -4827,7 +4910,7 @@ dht_fentrylk_cbk (call_frame_t *frame, void *cookie, int dht_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type) + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -4846,173 +4929,13 @@ dht_fentrylk (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, dht_fentrylk_cbk, subvol, subvol->fops->fentrylk, - volume, fd, basename, cmd, type); + volume, fd, basename, cmd, type, xdata); return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (fentrylk, frame, -1, op_errno); - - return 0; -} - - -int -dht_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *statpre, - struct iatt *statpost) -{ - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - - - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } - - dht_iatt_merge (this, &local->prebuf, statpre, prev->this); - dht_iatt_merge (this, &local->stbuf, statpost, prev->this); - - if (local->inode) { - local->prebuf.ia_ino = local->inode->ino; - local->stbuf.ia_ino = local->inode->ino; - } - - local->op_ret = 0; - } -unlock: - UNLOCK (&frame->lock); - - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_UNWIND (setattr, frame, local->op_ret, local->op_errno, - &local->prebuf, &local->stbuf); - - return 0; -} - - -int -dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid) -{ - dht_layout_t *layout = NULL; - dht_local_t *local = NULL; - int op_errno = -1; - int i = -1; - - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_DEBUG, - "memory allocation failed :("); - goto err; - } - - local->layout = layout = dht_layout_get (this, loc->inode); - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no layout for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } - - if (!layout_is_sane (layout)) { - gf_log (this->name, GF_LOG_DEBUG, - "layout is not sane for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } - - local->inode = 
inode_ref (loc->inode); - local->call_cnt = layout->cnt; - - for (i = 0; i < layout->cnt; i++) { - STACK_WIND (frame, dht_setattr_cbk, - layout->list[i].xlator, - layout->list[i].xlator->fops->setattr, - loc, stbuf, valid); - } - - return 0; - -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL); - - return 0; -} - - -int -dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, - int32_t valid) -{ - dht_layout_t *layout = NULL; - dht_local_t *local = NULL; - int op_errno = -1; - int i = -1; - - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - - goto err; - } - - local->layout = layout = dht_layout_get (this, fd->inode); - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no layout for fd=%p", fd); - op_errno = EINVAL; - goto err; - } - - if (!layout_is_sane (layout)) { - gf_log (this->name, GF_LOG_DEBUG, - "layout is not sane for fd=%p", fd); - op_errno = EINVAL; - goto err; - } - - local->inode = inode_ref (fd->inode); - local->call_cnt = layout->cnt; - - for (i = 0; i < layout->cnt; i++) { - STACK_WIND (frame, dht_setattr_cbk, - layout->list[i].xlator, - layout->list[i].xlator->fops->fsetattr, - fd, stbuf, valid); - } - - return 0; - -err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL); + DHT_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL); return 0; } @@ -5021,59 +4944,22 @@ err: int dht_forget (xlator_t *this, inode_t *inode) { - uint64_t tmp_layout = 0; + uint64_t ctx_int = 0; + dht_inode_ctx_t *ctx = NULL; dht_layout_t *layout = NULL; - inode_ctx_get (inode, this, &tmp_layout); + inode_ctx_del (inode, this, &ctx_int); - if (!tmp_layout) + if (!ctx_int) return 0; - layout = (dht_layout_t *)(long)tmp_layout; - dht_layout_unref (this, layout); - - return 0; -} - - - -int -dht_init_subvolumes (xlator_t *this, dht_conf_t *conf) -{ - xlator_list_t *subvols = NULL; - int cnt = 0; - - if (!conf) - return -1; - - for (subvols = this->children; subvols; subvols = subvols->next) - cnt++; + ctx = (dht_inode_ctx_t *) (long) ctx_int; - conf->subvolumes = GF_CALLOC (cnt, sizeof (xlator_t *), - gf_dht_mt_xlator_t); - if (!conf->subvolumes) { - - return -1; - } - conf->subvolume_cnt = cnt; - - cnt = 0; - for (subvols = this->children; subvols; subvols = subvols->next) - conf->subvolumes[cnt++] = subvols->xlator; - - conf->subvolume_status = GF_CALLOC (cnt, sizeof (char), - gf_dht_mt_char); - if (!conf->subvolume_status) { - - return -1; - } - - conf->last_event = GF_CALLOC (cnt, sizeof (int), - gf_dht_mt_char); - if (!conf->last_event) { + layout = ctx->layout; + ctx->layout = NULL; + dht_layout_unref (this, layout); + GF_FREE (ctx); - return -1; - } return 0; } @@ -5081,15 +4967,21 @@ dht_init_subvolumes (xlator_t *this, dht_conf_t *conf) int dht_notify (xlator_t *this, int event, void *data, ...) 
{ - xlator_t *subvol = NULL; - int cnt = -1; - int i = -1; - dht_conf_t *conf = NULL; - int ret = -1; - int propagate = 0; - - int had_heard_from_all = 0; - int have_heard_from_all = 0; + xlator_t *subvol = NULL; + int cnt = -1; + int i = -1; + dht_conf_t *conf = NULL; + int ret = -1; + int propagate = 0; + + int had_heard_from_all = 0; + int have_heard_from_all = 0; + struct timeval time = {0,}; + gf_defrag_info_t *defrag = NULL; + dict_t *dict = NULL; + gf_defrag_type cmd = 0; + dict_t *output = NULL; + va_list ap; conf = this->private; @@ -5124,10 +5016,12 @@ dht_notify (xlator_t *this, int event, void *data, ...) break; } + gettimeofday (&time, NULL); LOCK (&conf->subvolume_lock); { conf->subvolume_status[cnt] = 1; conf->last_event[cnt] = event; + conf->subvol_up_time[cnt] = time.tv_sec; } UNLOCK (&conf->subvolume_lock); @@ -5140,12 +5034,23 @@ dht_notify (xlator_t *this, int event, void *data, ...) subvol = data; conf->gen++; + propagate = 1; break; case GF_EVENT_CHILD_DOWN: subvol = data; + if (conf->assert_no_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "Received CHILD_DOWN. Exiting"); + if (conf->defrag) { + gf_defrag_stop (conf->defrag, NULL); + } else { + kill (getpid(), SIGTERM); + } + } + for (i = 0; i < conf->subvolume_cnt; i++) { if (subvol == conf->subvolumes[i]) { cnt = i; @@ -5164,6 +5069,7 @@ dht_notify (xlator_t *this, int event, void *data, ...) { conf->subvolume_status[cnt] = 0; conf->last_event[cnt] = event; + conf->subvol_up_time[cnt] = 0; } UNLOCK (&conf->subvolume_lock); @@ -5193,6 +5099,36 @@ dht_notify (xlator_t *this, int event, void *data, ...) 
UNLOCK (&conf->subvolume_lock); break; + case GF_EVENT_VOLUME_DEFRAG: + { + if (!conf->defrag) { + return ret; + } + defrag = conf->defrag; + + dict = data; + va_start (ap, data); + output = va_arg (ap, dict_t*); va_end (ap); /* every va_start needs a va_end before the function returns */ + + ret = dict_get_int32 (dict, "rebalance-command", + (int32_t*)&cmd); + if (ret) + return ret; + LOCK (&defrag->lock); + { + if (defrag->is_exiting) + goto unlock; + if (cmd == GF_DEFRAG_CMD_STATUS) + gf_defrag_status_get (defrag, output); + else if (cmd == GF_DEFRAG_CMD_STOP) + gf_defrag_stop (defrag, output); + } +unlock: + UNLOCK (&defrag->lock); + return 0; + break; + } + default: propagate = 1; break; @@ -5208,9 +5144,12 @@ dht_notify (xlator_t *this, int event, void *data, ...) /* if all subvols have reported status, no need to hide anything or wait for anything else. Just propagate blindly */ - if (have_heard_from_all) + if (have_heard_from_all) { propagate = 1; + } + + if (!had_heard_from_all && have_heard_from_all) { /* This is the first event which completes aggregation of events from all subvolumes. If at least one subvol @@ -5229,10 +5168,45 @@ dht_notify (xlator_t *this, int event, void *data, ...) /* continue to check other events for CHILD_UP */ } } + + /* rebalance is started with assert_no_child_down. So we do + * not need to handle CHILD_DOWN event here. 
+ */ + if (conf->defrag) { + ret = gf_thread_create (&conf->defrag->th, NULL, + gf_defrag_start, this); + if (ret) { + GF_FREE (conf->defrag); /* free first: clearing the pointer beforehand leaked the defrag info */ + conf->defrag = NULL; + kill (getpid(), SIGTERM); + } + } } - if (propagate || event == GF_EVENT_CHILD_MODIFIED) + ret = 0; + if (propagate) ret = default_notify (this, event, data); return ret; } + +int +dht_inode_ctx_layout_get (inode_t *inode, xlator_t *this, dht_layout_t **layout) +{ + dht_inode_ctx_t *ctx = NULL; + int ret = -1; + + ret = dht_inode_ctx_get (inode, this, &ctx); + + if (!ret && ctx) { + if (ctx->layout) { + if (layout) + *layout = ctx->layout; + ret = 0; + } else { + ret = -1; + } + } + + return ret; +} diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 9c39d0d63..5ccd66799 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #ifndef _CONFIG_H @@ -22,47 +13,90 @@ #include "config.h" #endif +#include <regex.h> + #include "dht-mem-types.h" #include "libxlator.h" +#include "syncop.h" #ifndef _DHT_H #define _DHT_H -#define GF_XATTR_FIX_LAYOUT_KEY "trusted.distribute.fix.layout" +#define GF_XATTR_FIX_LAYOUT_KEY "distribute.fix.layout" #define GF_DHT_LOOKUP_UNHASHED_ON 1 #define GF_DHT_LOOKUP_UNHASHED_AUTO 2 +#define DHT_PATHINFO_HEADER "DISTRIBUTE:" #include <fnmatch.h> typedef int (*dht_selfheal_dir_cbk_t) (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno); + xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t *xdata); +typedef int (*dht_defrag_cbk_fn_t) (xlator_t *this, call_frame_t *frame, + int ret); struct dht_layout { - int cnt; - int preset; - int gen; - int type; - int ref; /* use with dht_conf_t->layout_lock */ - int search_unhashed; + int spread_cnt; /* layout spread count per directory, + is controlled by 'setxattr()' with + special key */ + int cnt; + int preset; + int gen; + int type; + int ref; /* use with dht_conf_t->layout_lock */ + int search_unhashed; struct { - int err; /* 0 = normal - -1 = dir exists and no xattr - >0 = dir lookup failed with errno - */ - uint32_t start; - uint32_t stop; - xlator_t *xlator; - } list[0]; + int err; /* 0 = normal + -1 = dir exists and no xattr + >0 = dir lookup failed with errno + */ + uint32_t start; + uint32_t stop; + xlator_t *xlator; + } list[]; +}; +typedef struct dht_layout dht_layout_t; + +struct dht_stat_time { + uint32_t atime; + uint32_t atime_nsec; + uint32_t ctime; + uint32_t ctime_nsec; + uint32_t mtime; + uint32_t mtime_nsec; +}; + +typedef struct dht_stat_time dht_stat_time_t; + +struct dht_inode_ctx { + dht_layout_t *layout; + dht_stat_time_t time; }; -typedef struct dht_layout dht_layout_t; + +typedef struct dht_inode_ctx dht_inode_ctx_t; typedef enum { DHT_HASH_TYPE_DM, + DHT_HASH_TYPE_DM_USER, } dht_hashfn_type_t; +/* rebalance related */ +struct dht_rebalance_ { + 
xlator_t *from_subvol; + xlator_t *target_node; + off_t offset; + size_t size; + int32_t flags; + int count; + struct iobref *iobref; + struct iovec *vector; + struct iatt stbuf; + dht_defrag_cbk_fn_t target_op_fn; + dict_t *xdata; +}; struct dht_local { int call_cnt; @@ -96,6 +130,7 @@ struct dht_local { int file_count; int dir_count; call_frame_t *main_frame; + int fop_succeeded; struct { fop_mknod_cbk_t linkfile_cbk; struct iatt stbuf; @@ -107,7 +142,6 @@ struct dht_local { struct { uint32_t hole_cnt; uint32_t overlaps_cnt; - uint32_t missing; uint32_t down; uint32_t misc; dht_selfheal_dir_cbk_t dir_cbk; @@ -120,11 +154,16 @@ struct dht_local { int32_t flags; mode_t mode; dev_t rdev; + mode_t umask; /* need for file-info */ - char *pathinfo; + char *xattr_val; char *key; + /* which xattr request? */ + char xsel[256]; + int32_t alloc_len; + char *newpath; /* gfid related */ @@ -132,17 +171,85 @@ struct dht_local { /*Marker Related*/ struct marker_str marker; + + /* flag used to make sure we need to return estale in + {lookup,revalidate}_cbk */ + char return_estale; + char need_lookup_everywhere; + + glusterfs_fop_t fop; + + gf_boolean_t linked; + xlator_t *link_subvol; + + struct dht_rebalance_ rebalance; + xlator_t *first_up_subvol; + }; typedef struct dht_local dht_local_t; /* du - disk-usage */ struct dht_du { double avail_percent; + double avail_inodes; uint64_t avail_space; uint32_t log; }; typedef struct dht_du dht_du_t; +enum gf_defrag_type { + GF_DEFRAG_CMD_START = 1, + GF_DEFRAG_CMD_STOP = 1 + 1, + GF_DEFRAG_CMD_STATUS = 1 + 2, + GF_DEFRAG_CMD_START_LAYOUT_FIX = 1 + 3, + GF_DEFRAG_CMD_START_FORCE = 1 + 4, +}; +typedef enum gf_defrag_type gf_defrag_type; + +enum gf_defrag_status_t { + GF_DEFRAG_STATUS_NOT_STARTED, + GF_DEFRAG_STATUS_STARTED, + GF_DEFRAG_STATUS_STOPPED, + GF_DEFRAG_STATUS_COMPLETE, + GF_DEFRAG_STATUS_FAILED, + GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED, + GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED, + GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE, + 
GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED, +}; +typedef enum gf_defrag_status_t gf_defrag_status_t; + +typedef struct gf_defrag_pattern_list gf_defrag_pattern_list_t; + +struct gf_defrag_pattern_list { + char path_pattern[256]; + uint64_t size; + gf_defrag_pattern_list_t *next; +}; + +struct gf_defrag_info_ { + uint64_t total_files; + uint64_t total_data; + uint64_t num_files_lookedup; + uint64_t total_failures; + uint64_t skipped; + gf_lock_t lock; + int cmd; + pthread_t th; + gf_defrag_status_t defrag_status; + struct rpc_clnt *rpc; + uint32_t connected; + uint32_t is_exiting; + pid_t pid; + inode_t *root_inode; + uuid_t node_uuid; + struct timeval start_time; + gf_boolean_t stats; + gf_defrag_pattern_list_t *defrag_pattern; +}; + +typedef struct gf_defrag_info_ gf_defrag_info_t; + struct dht_conf { gf_lock_t subvolume_lock; int subvolume_cnt; @@ -151,11 +258,11 @@ struct dht_conf { int *last_event; dht_layout_t **file_layouts; dht_layout_t **dir_layouts; - dht_layout_t *default_dir_layout; gf_boolean_t search_unhashed; int gen; dht_du_t *du_stats; - uint64_t min_free_disk; + double min_free_disk; + double min_free_inodes; char disk_unit; int32_t refresh_interval; gf_boolean_t unhashed_sticky_bit; @@ -165,6 +272,35 @@ struct dht_conf { dht */ gf_boolean_t use_readdirp; char vol_uuid[UUID_SIZE + 1]; + gf_boolean_t assert_no_child_down; + time_t *subvol_up_time; + + /* This is the count used as the distribute layout for a directory */ + /* Will be a global flag to control the layout spread count */ + uint32_t dir_spread_cnt; + + /* to keep track of nodes which are decomissioned */ + xlator_t **decommissioned_bricks; + int decommission_in_progress; + int decommission_subvols_cnt; + + /* defrag related */ + gf_defrag_info_t *defrag; + + /* Request to filter directory entries in readdir request */ + + gf_boolean_t readdir_optimize; + + /* Support regex-based name reinterpretation. 
*/ + regex_t rsync_regex; + gf_boolean_t rsync_regex_valid; + regex_t extra_regex; + gf_boolean_t extra_regex_valid; + + /* Support variable xattr names. */ + char *xattr_name; + char *link_xattr_name; + char *wild_xattr_name; }; typedef struct dht_conf dht_conf_t; @@ -179,21 +315,45 @@ struct dht_disk_layout { }; typedef struct dht_disk_layout dht_disk_layout_t; -#define WIPE(statp) do { typeof(*statp) z = {0,}; if (statp) *statp = z; } while (0) +typedef enum { + GF_DHT_MIGRATE_DATA, + GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS, + GF_DHT_MIGRATE_HARDLINK, + GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS +} gf_dht_migrate_data_type_t; #define ENTRY_MISSING(op_ret, op_errno) (op_ret == -1 && op_errno == ENOENT) -#define is_fs_root(loc) (strcmp (loc->path, "/") == 0) - -#define is_revalidate(loc) (inode_ctx_get (loc->inode, this, NULL) == 0) +#define is_revalidate(loc) (dht_inode_ctx_layout_get (loc->inode, this, NULL) == 0) #define is_last_call(cnt) (cnt == 0) -#define DHT_LINKFILE_MODE (S_ISVTX) -#define check_is_linkfile(i,s,x) ( \ - ((st_mode_from_ia (s->ia_prot, s->ia_type) & ~S_IFMT) \ - == DHT_LINKFILE_MODE) && \ - (s->ia_size == 0)) +#define DHT_MIGRATION_IN_PROGRESS 1 +#define DHT_MIGRATION_COMPLETED 2 + +#define DHT_LINKFILE_MODE (S_ISVTX) + +#define check_is_linkfile(i,s,x,n) ( \ + ((st_mode_from_ia ((s)->ia_prot, (s)->ia_type) & ~S_IFMT) \ + == DHT_LINKFILE_MODE) && \ + dict_get (x, n)) + +#define IS_DHT_MIGRATION_PHASE2(buf) ( \ + IA_ISREG ((buf)->ia_type) && \ + ((st_mode_from_ia ((buf)->ia_prot, (buf)->ia_type) & \ + ~S_IFMT) == DHT_LINKFILE_MODE)) + +#define IS_DHT_MIGRATION_PHASE1(buf) ( \ + IA_ISREG ((buf)->ia_type) && \ + ((buf)->ia_prot.sticky == 1) && \ + ((buf)->ia_prot.sgid == 1)) + +#define DHT_STRIP_PHASE1_FLAGS(buf) do { \ + if ((buf) && IS_DHT_MIGRATION_PHASE1(buf)) { \ + (buf)->ia_prot.sticky = 0; \ + (buf)->ia_prot.sgid = 0; \ + } \ + } while (0) #define check_is_dir(i,s,x) (IA_ISDIR(s->ia_type)) @@ -201,107 +361,427 @@ typedef struct dht_disk_layout 
dht_disk_layout_t; #define DHT_STACK_UNWIND(fop, frame, params ...) do { \ dht_local_t *__local = NULL; \ - xlator_t *__xl = NULL; \ + xlator_t *__xl = NULL; \ if (frame) { \ - __xl = frame->this; \ - __local = frame->local; \ + __xl = frame->this; \ + __local = frame->local; \ frame->local = NULL; \ } \ STACK_UNWIND_STRICT (fop, frame, params); \ dht_local_wipe (__xl, __local); \ } while (0) -#define DHT_STACK_DESTROY(frame) do { \ - dht_local_t *__local = NULL; \ - xlator_t *__xl = NULL; \ - __xl = frame->this; \ - __local = frame->local; \ - frame->local = NULL; \ - STACK_DESTROY (frame->root); \ - dht_local_wipe (__xl, __local); \ +#define DHT_STACK_DESTROY(frame) do { \ + dht_local_t *__local = NULL; \ + xlator_t *__xl = NULL; \ + __xl = frame->this; \ + __local = frame->local; \ + frame->local = NULL; \ + STACK_DESTROY (frame->root); \ + dht_local_wipe (__xl, __local); \ + } while (0) + +#define DHT_UPDATE_TIME(ctx_sec, ctx_nsec, new_sec, new_nsec, inode, post) do {\ + int32_t sec = 0; \ + sec = new_sec; \ + LOCK (&inode->lock); \ + { \ + new_sec = max(new_sec, ctx_sec); \ + if (sec < new_sec) \ + new_nsec = ctx_nsec; \ + if (sec == new_sec) \ + new_nsec = max (new_nsec, ctx_nsec); \ + if (post) { \ + ctx_sec = new_sec; \ + ctx_nsec = new_nsec; \ + } \ + } \ + UNLOCK (&inode->lock); \ } while (0) -dht_layout_t *dht_layout_new (xlator_t *this, int cnt); -dht_layout_t *dht_layout_get (xlator_t *this, inode_t *inode); -dht_layout_t *dht_layout_for_subvol (xlator_t *this, xlator_t *subvol); -xlator_t *dht_layout_search (xlator_t *this, dht_layout_t *layout, +#define is_greater_time(a, an, b, bn) (((a) < (b)) || (((a) == (b)) && ((an) < (bn)))) +dht_layout_t *dht_layout_new (xlator_t *this, int cnt); +dht_layout_t *dht_layout_get (xlator_t *this, inode_t *inode); +dht_layout_t *dht_layout_for_subvol (xlator_t *this, xlator_t *subvol); +xlator_t *dht_layout_search (xlator_t *this, dht_layout_t *layout, const char *name); -int dht_layout_normalize (xlator_t *this, 
loc_t *loc, dht_layout_t *layout); -int dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, - uint32_t *holes_p, uint32_t *overlaps_p, - uint32_t *missing_p, uint32_t *down_p, - uint32_t *misc_p); -int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, - xlator_t *subvol, loc_t *loc, dict_t *xattr); +int dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout); +int dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, + uint32_t *holes_p, uint32_t *overlaps_p, + uint32_t *missing_p, uint32_t *down_p, + uint32_t *misc_p, uint32_t *no_space_p); +int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, + xlator_t *subvol, loc_t *loc, dict_t *xattr); xlator_t *dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct iatt *buf, dict_t *xattr); -int dht_linkfile_unlink (call_frame_t *frame, xlator_t *this, - xlator_t *subvol, loc_t *loc); +int dht_linkfile_unlink (call_frame_t *frame, xlator_t *this, + xlator_t *subvol, loc_t *loc); int dht_layouts_init (xlator_t *this, dht_conf_t *conf); int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, - int op_ret, int op_errno, dict_t *xattr); + int op_ret, int op_errno, dict_t *xattr); int dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout, - int pos, int32_t **disk_layout_p); -int dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, - int pos, void *disk_layout_raw); + int pos, int32_t **disk_layout_p); +int dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, + int pos, void *disk_layout_raw, int disk_layout_len); int dht_frame_return (call_frame_t *frame); -int dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y); +int dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y); int dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol, uint64_t *x); void dht_local_wipe (xlator_t *this, dht_local_t *local); -dht_local_t *dht_local_init (call_frame_t 
*frame); -int dht_iatt_merge (xlator_t *this, struct iatt *to, struct iatt *from, - xlator_t *subvol); +dht_local_t *dht_local_init (call_frame_t *frame, loc_t *loc, fd_t *fd, + glusterfs_fop_t fop); +int dht_iatt_merge (xlator_t *this, struct iatt *to, struct iatt *from, + xlator_t *subvol); xlator_t *dht_subvol_get_hashed (xlator_t *this, loc_t *loc); xlator_t *dht_subvol_get_cached (xlator_t *this, inode_t *inode); xlator_t *dht_subvol_next (xlator_t *this, xlator_t *prev); -int dht_subvol_cnt (xlator_t *this, xlator_t *subvol); +xlator_t *dht_subvol_next_available (xlator_t *this, xlator_t *prev); +int dht_subvol_cnt (xlator_t *this, xlator_t *subvol); -int dht_hash_compute (int type, const char *name, uint32_t *hash_p); +int dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p); -int dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, - xlator_t *tovol, xlator_t *fromvol, loc_t *loc); -int dht_lookup_directory (call_frame_t *frame, xlator_t *this, loc_t *loc); -int dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc); +int dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, + xlator_t *this, xlator_t *tovol, + xlator_t *fromvol, loc_t *loc); +int dht_lookup_directory (call_frame_t *frame, xlator_t *this, loc_t *loc); +int dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc); int -dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, - loc_t *loc, dht_layout_t *layout); +dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, + loc_t *loc, dht_layout_t *layout); int dht_selfheal_new_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, dht_layout_t *layout); int -dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, - loc_t *loc, dht_layout_t *layout); +dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, + loc_t *loc, dht_layout_t *layout); int dht_layout_sort_volname (dht_layout_t 
*layout); -int dht_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc); - int dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc); -int dht_is_subvol_filled (xlator_t *this, xlator_t *subvol); -xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol); -int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx); +gf_boolean_t dht_is_subvol_filled (xlator_t *this, xlator_t *subvol); +xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol, + dht_local_t *layout); +int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx); int dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode); -int dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout); -void dht_layout_unref (xlator_t *this, dht_layout_t *layout); +int dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout);; +void dht_layout_unref (xlator_t *this, dht_layout_t *layout); dht_layout_t *dht_layout_ref (xlator_t *this, dht_layout_t *layout); -xlator_t *dht_first_up_subvol (xlator_t *this); -xlator_t *dht_last_up_subvol (xlator_t *this); +xlator_t *dht_first_up_subvol (xlator_t *this); +xlator_t *dht_last_up_subvol (xlator_t *this); int dht_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name); -int dht_filter_loc_subvol_key (xlator_t *this, loc_t *loc, loc_t *new_loc, +int dht_filter_loc_subvol_key (xlator_t *this, loc_t *loc, loc_t *new_loc, xlator_t **subvol); -#endif /* _DHT_H */ +int dht_rename_cleanup (call_frame_t *frame); +int dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata); + +int dht_fix_directory_layout (call_frame_t *frame, + dht_selfheal_dir_cbk_t dir_cbk, + dht_layout_t *layout); + +int dht_init_subvolumes (xlator_t *this, dht_conf_t *conf); + +/* migration/rebalance */ +int 
dht_start_rebalance_task (xlator_t *this, call_frame_t *frame); + +int dht_rebalance_in_progress_check (xlator_t *this, call_frame_t *frame); +int dht_rebalance_complete_check (xlator_t *this, call_frame_t *frame); + + +/* FOPS */ +int32_t dht_lookup (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *xattr_req); + +int32_t dht_stat (call_frame_t *frame, + xlator_t *this, + loc_t *loc, dict_t *xdata); + +int32_t dht_fstat (call_frame_t *frame, + xlator_t *this, + fd_t *fd, dict_t *xdata); + +int32_t dht_truncate (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + off_t offset, dict_t *xdata); + +int32_t dht_ftruncate (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + off_t offset, dict_t *xdata); + +int32_t dht_access (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t mask, dict_t *xdata); + +int32_t dht_readlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + size_t size, dict_t *xdata); + +int32_t dht_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, + mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata); + +int32_t dht_mkdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata); + +int32_t dht_unlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc, int xflag, dict_t *xdata); + +int32_t dht_rmdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, int flags, dict_t *xdata); + +int32_t dht_symlink (call_frame_t *frame, xlator_t *this, + const char *linkpath, loc_t *loc, mode_t umask, + dict_t *xdata); + +int32_t dht_rename (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc, dict_t *xdata); + +int32_t dht_link (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc, dict_t *xdata); + +int32_t dht_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, mode_t mode, + mode_t umask, fd_t *fd, dict_t *params); + +int32_t dht_open (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, fd_t *fd, dict_t 
*xdata); + +int32_t dht_readv (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset, uint32_t flags, dict_t *xdata); + +int32_t dht_writev (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t offset, + uint32_t flags, + struct iobref *iobref, dict_t *xdata); + +int32_t dht_flush (call_frame_t *frame, + xlator_t *this, + fd_t *fd, dict_t *xdata); + +int32_t dht_fsync (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t datasync, dict_t *xdata); + +int32_t dht_opendir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, fd_t *fd, dict_t *xdata); + +int32_t dht_fsyncdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t datasync, dict_t *xdata); + +int32_t dht_statfs (call_frame_t *frame, + xlator_t *this, + loc_t *loc, dict_t *xdata); + +int32_t dht_setxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *dict, + int32_t flags, dict_t *xdata); + +int32_t dht_getxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name, dict_t *xdata); + +int32_t dht_fsetxattr (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + dict_t *dict, + int32_t flags, dict_t *xdata); + +int32_t dht_fgetxattr (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + const char *name, dict_t *xdata); + +int32_t dht_removexattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name, dict_t *xdata); +int32_t dht_fremovexattr (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + const char *name, dict_t *xdata); + +int32_t dht_lk (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t cmd, + struct gf_flock *flock, dict_t *xdata); + +int32_t dht_inodelk (call_frame_t *frame, xlator_t *this, + const char *volume, loc_t *loc, int32_t cmd, + struct gf_flock *flock, dict_t *xdata); + +int32_t dht_finodelk (call_frame_t *frame, xlator_t *this, + const char *volume, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata); + +int32_t dht_entrylk 
(call_frame_t *frame, xlator_t *this, + const char *volume, loc_t *loc, const char *basename, + entrylk_cmd cmd, entrylk_type type, dict_t *xdata); + +int32_t dht_fentrylk (call_frame_t *frame, xlator_t *this, + const char *volume, fd_t *fd, const char *basename, + entrylk_cmd cmd, entrylk_type type, dict_t *xdata); + +int32_t dht_readdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, off_t off, dict_t *xdata); + +int32_t dht_readdirp (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, off_t off, dict_t *dict); + +int32_t dht_xattrop (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + gf_xattrop_flags_t flags, + dict_t *dict, dict_t *xdata); + +int32_t dht_fxattrop (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + gf_xattrop_flags_t flags, + dict_t *dict, dict_t *xdata); + +int32_t dht_forget (xlator_t *this, inode_t *inode); +int32_t dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata); +int32_t dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata); +int32_t dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t mode, off_t offset, size_t len, dict_t *xdata); +int32_t dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, size_t len, dict_t *xdata); +int32_t dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, size_t len, dict_t *xdata); + +int32_t dht_init (xlator_t *this); +void dht_fini (xlator_t *this); +int dht_reconfigure (xlator_t *this, dict_t *options); +int32_t dht_notify (xlator_t *this, int32_t event, void *data, ...); + +/* definitions for nufa/switch */ +int dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent); +int dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int 
op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent); +int dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent); +int dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent); +int dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + fd_t *fd, inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata); +int dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata); + +int +gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict); + +int +gf_defrag_stop (gf_defrag_info_t *defrag, dict_t *output); + +void* +gf_defrag_start (void *this); + +int32_t +gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs, + struct iatt *stbuf); +int +dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, + int flag); +int +dht_inode_ctx_layout_get (inode_t *inode, xlator_t *this, + dht_layout_t **layout_int); +int +dht_inode_ctx_layout_set (inode_t *inode, xlator_t *this, + dht_layout_t* layout_int); +int +dht_inode_ctx_time_update (inode_t *inode, xlator_t *this, struct iatt *stat, + int32_t update_ctx); + +int dht_inode_ctx_get (inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx); +int dht_inode_ctx_set (inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx); +int +dht_dir_attr_heal (void *data); +int +dht_dir_attr_heal_done (int ret, call_frame_t *sync_frame, void *data); +int +dht_dir_has_layout (dict_t *xattr, char *name); +gf_boolean_t +dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator); +xlator_t * 
+dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol, + dht_layout_t *layout); +xlator_t * +dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol, + dht_layout_t *layout); +int +dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this); + +void +dht_layout_dump (dht_layout_t *layout, const char *prefix); +int32_t +dht_priv_dump (xlator_t *this); +int32_t +dht_inodectx_dump (xlator_t *this, inode_t *inode); + +int +dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol); + +#endif/* _DHT_H */ diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c index 75953781e..fe3955ecb 100644 --- a/xlators/cluster/dht/src/dht-diskusage.c +++ b/xlators/cluster/dht/src/dht-diskusage.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ @@ -35,224 +26,389 @@ int dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct statvfs *statvfs) + int op_ret, int op_errno, struct statvfs *statvfs, + dict_t *xdata) { - dht_conf_t *conf = NULL; - call_frame_t *prev = NULL; - int this_call_cnt = 0; - int i = 0; - double percent = 0; - uint64_t bytes = 0; - - conf = this->private; - prev = cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "failed to get disk info from %s", prev->this->name); - goto out; - } - - if (statvfs && statvfs->f_blocks) { - percent = (statvfs->f_bfree * 100) / statvfs->f_blocks; - bytes = (statvfs->f_bfree * statvfs->f_frsize); - } - - LOCK (&conf->subvolume_lock); - { - for (i = 0; i < conf->subvolume_cnt; i++) - if (prev->this == conf->subvolumes[i]) { - conf->du_stats[i].avail_percent = percent; - conf->du_stats[i].avail_space = bytes; - gf_log (this->name, GF_LOG_DEBUG, - "on subvolume '%s': avail_percent is: " - "%.2f and avail_space is: %"PRIu64"", - prev->this->name, - conf->du_stats[i].avail_percent, - conf->du_stats[i].avail_space); - } - } - UNLOCK (&conf->subvolume_lock); + dht_conf_t *conf = NULL; + call_frame_t *prev = NULL; + int this_call_cnt = 0; + int i = 0; + double percent = 0; + double percent_inodes = 0; + uint64_t bytes = 0; + + conf = this->private; + prev = cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "failed to get disk info from %s", prev->this->name); + goto out; + } + + if (statvfs && statvfs->f_blocks) { + percent = (statvfs->f_bavail * 100) / statvfs->f_blocks; + bytes = (statvfs->f_bavail * statvfs->f_frsize); + } + + if (statvfs && statvfs->f_files) { + percent_inodes = (statvfs->f_ffree * 100) / statvfs->f_files; + } else { + /* set percent inodes to 100 for dynamically allocated inode filesystems + this logic holds good so that, distribute has nothing to worry about + total inodes rather let the 'create()' to be scheduled on the hashed + subvol regardless of 
the total inodes. since we have no awareness on + loosing inodes this logic fits well + */ + percent_inodes = 100; + } + + LOCK (&conf->subvolume_lock); + { + for (i = 0; i < conf->subvolume_cnt; i++) + if (prev->this == conf->subvolumes[i]) { + conf->du_stats[i].avail_percent = percent; + conf->du_stats[i].avail_space = bytes; + conf->du_stats[i].avail_inodes = percent_inodes; + gf_log (this->name, GF_LOG_DEBUG, + "on subvolume '%s': avail_percent is: " + "%.2f and avail_space is: %"PRIu64" " + "and avail_inodes is: %.2f", + prev->this->name, + conf->du_stats[i].avail_percent, + conf->du_stats[i].avail_space, + conf->du_stats[i].avail_inodes); + } + } + UNLOCK (&conf->subvolume_lock); out: - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_DESTROY (frame); + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) + DHT_STACK_DESTROY (frame); - return 0; + return 0; } int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx) { - dht_conf_t *conf = NULL; - call_frame_t *statfs_frame = NULL; - dht_local_t *statfs_local = NULL; - call_pool_t *pool = NULL; - - conf = this->private; - pool = this->ctx->pool; - - statfs_frame = create_frame (this, pool); - if (!statfs_frame) { - goto err; - } - - statfs_local = dht_local_init (statfs_frame); - if (!statfs_local) { - goto err; - } - - loc_t tmp_loc = { .inode = NULL, - .path = "/", - }; - - statfs_local->call_cnt = 1; - STACK_WIND (statfs_frame, dht_du_info_cbk, - conf->subvolumes[subvol_idx], - conf->subvolumes[subvol_idx]->fops->statfs, - &tmp_loc); - - return 0; + dht_conf_t *conf = NULL; + call_frame_t *statfs_frame = NULL; + dht_local_t *statfs_local = NULL; + call_pool_t *pool = NULL; + loc_t tmp_loc = {0,}; + + conf = this->private; + pool = this->ctx->pool; + + statfs_frame = create_frame (this, pool); + if (!statfs_frame) { + goto err; + } + + /* local->fop value is not used in this case */ + statfs_local = dht_local_init (statfs_frame, NULL, 
NULL, + GF_FOP_MAXVALUE); + if (!statfs_local) { + goto err; + } + + /* make it root gfid, should be enough to get the proper info back */ + tmp_loc.gfid[15] = 1; + + statfs_local->call_cnt = 1; + STACK_WIND (statfs_frame, dht_du_info_cbk, + conf->subvolumes[subvol_idx], + conf->subvolumes[subvol_idx]->fops->statfs, + &tmp_loc, NULL); + + return 0; err: - if (statfs_frame) - DHT_STACK_DESTROY (statfs_frame); + if (statfs_frame) + DHT_STACK_DESTROY (statfs_frame); - return -1; + return -1; } int dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc) { - int i = 0; - dht_conf_t *conf = NULL; - call_frame_t *statfs_frame = NULL; - dht_local_t *statfs_local = NULL; - struct timeval tv = {0,}; + int i = 0; + dht_conf_t *conf = NULL; + call_frame_t *statfs_frame = NULL; + dht_local_t *statfs_local = NULL; + struct timeval tv = {0,}; + loc_t tmp_loc = {0,}; + + conf = this->private; + + gettimeofday (&tv, NULL); + + /* make it root gfid, should be enough to get the proper + info back */ + tmp_loc.gfid[15] = 1; + + if (tv.tv_sec > (conf->refresh_interval + + conf->last_stat_fetch.tv_sec)) { + + statfs_frame = copy_frame (frame); + if (!statfs_frame) { + goto err; + } + + /* In this case, 'local->fop' is not used */ + statfs_local = dht_local_init (statfs_frame, loc, NULL, + GF_FOP_MAXVALUE); + if (!statfs_local) { + goto err; + } + + statfs_local->call_cnt = conf->subvolume_cnt; + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (statfs_frame, dht_du_info_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->statfs, + &tmp_loc, NULL); + } + + conf->last_stat_fetch.tv_sec = tv.tv_sec; + } + return 0; +err: + if (statfs_frame) + DHT_STACK_DESTROY (statfs_frame); - conf = this->private; + return -1; +} - gettimeofday (&tv, NULL); - if (tv.tv_sec > (conf->refresh_interval - + conf->last_stat_fetch.tv_sec)) { - statfs_frame = copy_frame (frame); - if (!statfs_frame) { - goto err; - } +gf_boolean_t +dht_is_subvol_filled (xlator_t *this, xlator_t *subvol) +{ 
+ int i = 0; + dht_conf_t *conf = NULL; + gf_boolean_t subvol_filled_inodes = _gf_false; + gf_boolean_t subvol_filled_space = _gf_false; + gf_boolean_t is_subvol_filled = _gf_false; + + conf = this->private; + + /* Check for values above specified percent or free disk */ + LOCK (&conf->subvolume_lock); + { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + if (conf->disk_unit == 'p') { + if (conf->du_stats[i].avail_percent < + conf->min_free_disk) { + subvol_filled_space = _gf_true; + break; + } + + } else { + if (conf->du_stats[i].avail_space < + conf->min_free_disk) { + subvol_filled_space = _gf_true; + break; + } + } + if (conf->du_stats[i].avail_inodes < + conf->min_free_inodes) { + subvol_filled_inodes = _gf_true; + break; + } + } + } + } + UNLOCK (&conf->subvolume_lock); + + if (subvol_filled_space && conf->subvolume_status[i]) { + if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) { + gf_log (this->name, GF_LOG_WARNING, + "disk space on subvolume '%s' is getting " + "full (%.2f %%), consider adding more nodes", + subvol->name, + (100 - conf->du_stats[i].avail_percent)); + } + } + + if (subvol_filled_inodes && conf->subvolume_status[i]) { + if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) { + gf_log (this->name, GF_LOG_CRITICAL, + "inodes on subvolume '%s' are at " + "(%.2f %%), consider adding more nodes", + subvol->name, + (100 - conf->du_stats[i].avail_inodes)); + } + } + + is_subvol_filled = (subvol_filled_space || subvol_filled_inodes); + + return is_subvol_filled; +} + + +/*Get the best subvolume to create the file in*/ +xlator_t * +dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol, + dht_local_t *local) +{ + xlator_t *avail_subvol = NULL; + dht_conf_t *conf = NULL; + dht_layout_t *layout = NULL; + loc_t *loc = NULL; - statfs_local = dht_local_init (statfs_frame); - if (!statfs_local) { - goto err; + conf = this->private; + if (!local) + goto out; + loc = &local->loc; + if 
(!local->layout) { + layout = dht_layout_get (this, loc->parent); + + if (!layout) { + gf_log (this->name, GF_LOG_DEBUG, + "layout missing path=%s parent=%s", + loc->path, uuid_utoa (loc->parent->gfid)); + goto out; } + } else { + layout = dht_layout_ref (this, local->layout); + } - loc_copy (&statfs_local->loc, loc); - loc_t tmp_loc = { .inode = NULL, - .path = "/", - }; - - statfs_local->call_cnt = conf->subvolume_cnt; - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (statfs_frame, dht_du_info_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->statfs, - &tmp_loc); + LOCK (&conf->subvolume_lock); + { + avail_subvol = dht_subvol_with_free_space_inodes(this, subvol, + layout); + if(!avail_subvol) + { + avail_subvol = dht_subvol_maxspace_nonzeroinode(this, + subvol, + layout); } - conf->last_stat_fetch.tv_sec = tv.tv_sec; - } - return 0; -err: - if (statfs_frame) - DHT_STACK_DESTROY (statfs_frame); + } + UNLOCK (&conf->subvolume_lock); +out: + if (!avail_subvol) { + gf_log (this->name, + GF_LOG_DEBUG, + "no subvolume has enough free space and/or inodes\ + to create"); + avail_subvol = subvol; + } - return -1; + if (layout) + dht_layout_unref (this, layout); + return avail_subvol; } +static inline +int32_t dht_subvol_has_err (xlator_t *this, dht_layout_t *layout) +{ + int ret = -1; + int i = 0; -int -dht_is_subvol_filled (xlator_t *this, xlator_t *subvol) + if (!this || !layout) + goto out; + + /* check if subvol has layout errors, before selecting it */ + for (i = 0; i < layout->cnt; i++) { + if (!strcmp (layout->list[i].xlator->name, this->name) && + (layout->list[i].err != 0)) { + ret = -1; + goto out; + } + } + ret = 0; +out: + return ret; +} + +/*Get subvolume which has both space and inodes more than the min criteria*/ +xlator_t * +dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol, + dht_layout_t *layout) { - int i = 0; - int subvol_filled = 0; + int i = 0; + double max = 0; + double max_inodes = 0; + int ignore_subvol = 0; + + 
xlator_t *avail_subvol = NULL; dht_conf_t *conf = NULL; conf = this->private; - /* Check for values above specified percent or free disk */ - LOCK (&conf->subvolume_lock); - { - for (i = 0; i < conf->subvolume_cnt; i++) { - if (subvol == conf->subvolumes[i]) { - if (conf->disk_unit == 'p') { - if (conf->du_stats[i].avail_percent < - conf->min_free_disk) { - subvol_filled = 1; - break; - } - } else { - if (conf->du_stats[i].avail_space < - conf->min_free_disk) { - subvol_filled = 1; - break; - } - } + for(i=0; i < conf->subvolume_cnt; i++) { + /* check if subvol has layout errors, before selecting it */ + ignore_subvol = dht_subvol_has_err (conf->subvolumes[i], + layout); + if (ignore_subvol) + continue; + + if ((conf->disk_unit == 'p') && + (conf->du_stats[i].avail_percent > conf->min_free_disk) && + (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) { + if ((conf->du_stats[i].avail_inodes > max_inodes) || + (conf->du_stats[i].avail_percent > max)) { + max = conf->du_stats[i].avail_percent; + max_inodes = conf->du_stats[i].avail_inodes; + avail_subvol = conf->subvolumes[i]; } } - } - UNLOCK (&conf->subvolume_lock); - - if (subvol_filled && conf->subvolume_status[i]) { - if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) { - gf_log (this->name, GF_LOG_WARNING, - "disk space on subvolume '%s' is getting " - "full (%.2f %%), consider adding more nodes", - subvol->name, - (100 - conf->du_stats[i].avail_percent)); + + if ((conf->disk_unit != 'p') && + (conf->du_stats[i].avail_space > conf->min_free_disk) && + (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) { + if ((conf->du_stats[i].avail_inodes > max_inodes) || + (conf->du_stats[i].avail_space > max)) { + max = conf->du_stats[i].avail_space; + max_inodes = conf->du_stats[i].avail_inodes; + avail_subvol = conf->subvolumes[i]; + } } } - return subvol_filled; + return avail_subvol; } + +/* Get subvol which has atleast one inode and maximum space */ xlator_t * -dht_free_disk_available_subvol 
(xlator_t *this, xlator_t *subvol) +dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol, + dht_layout_t *layout) { int i = 0; - double max= 0; + double max = 0; + int ignore_subvol = 0; + xlator_t *avail_subvol = NULL; dht_conf_t *conf = NULL; conf = this->private; - LOCK (&conf->subvolume_lock); - { - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->disk_unit == 'p') { - if (conf->du_stats[i].avail_percent > max) { - max = conf->du_stats[i].avail_percent; - avail_subvol = conf->subvolumes[i]; - } - } else { - if (conf->du_stats[i].avail_space > max) { - max = conf->du_stats[i].avail_space; - avail_subvol = conf->subvolumes[i]; - } + for (i = 0; i < conf->subvolume_cnt; i++) { + /* check if subvol has layout errors, before selecting it */ + ignore_subvol = dht_subvol_has_err (conf->subvolumes[i], + layout); + if (ignore_subvol) + continue; + + if (conf->disk_unit == 'p') { + if ((conf->du_stats[i].avail_percent > max) + && (conf->du_stats[i].avail_inodes > 0 )) { + max = conf->du_stats[i].avail_percent; + avail_subvol = conf->subvolumes[i]; } - } - } - UNLOCK (&conf->subvolume_lock); - - if (!avail_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume has enough free space to create"); + } else { + if ((conf->du_stats[i].avail_space > max) + && (conf->du_stats[i].avail_inodes > 0)) { + max = conf->du_stats[i].avail_space; + avail_subvol = conf->subvolumes[i]; + } + } } - if (max < conf->min_free_disk) - avail_subvol = subvol; - - if (!avail_subvol) - avail_subvol = subvol; - return avail_subvol; } diff --git a/xlators/cluster/dht/src/dht-hashfn.c b/xlators/cluster/dht/src/dht-hashfn.c index 99bb13265..656cf23a0 100644 --- a/xlators/cluster/dht/src/dht-hashfn.c +++ b/xlators/cluster/dht/src/dht-hashfn.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. 
- GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -37,6 +28,7 @@ dht_hash_compute_internal (int type, const char *name, uint32_t *hash_p) switch (type) { case DHT_HASH_TYPE_DM: + case DHT_HASH_TYPE_DM_USER: hash = gf_dm_hashfn (name, strlen (name)); break; default: @@ -52,30 +44,68 @@ dht_hash_compute_internal (int type, const char *name, uint32_t *hash_p) } -#define MAKE_RSYNC_FRIENDLY_NAME(rsync_frndly_name, name) do { \ - rsync_frndly_name = (char *) name; \ - if (name[0] == '.') { \ - char *dot = 0; \ - int namelen = 0; \ - \ - dot = strrchr (name, '.'); \ - if (dot && dot > (name + 1) && *(dot + 1)) { \ - namelen = (dot - name); \ - rsync_frndly_name = alloca (namelen); \ - strncpy (rsync_frndly_name, name + 1, \ - namelen); \ - rsync_frndly_name[namelen - 1] = 0; \ - } \ - } \ - } while (0); +static inline +gf_boolean_t +dht_munge_name (const char *original, char *modified, size_t len, regex_t *re) +{ + regmatch_t matches[2]; + size_t new_len; + + if (regexec(re,original,2,matches,0) != REG_NOMATCH) { + if (matches[1].rm_so != -1) { + new_len = matches[1].rm_eo - matches[1].rm_so; + /* Equal 
would fail due to the NUL at the end. */ + if (new_len < len) { + memcpy (modified,original+matches[1].rm_so, + new_len); + modified[new_len] = '\0'; + return _gf_true; + } + } + } + /* This is guaranteed safe because of how the dest was allocated. */ + strcpy(modified,original); + return _gf_false; +} int -dht_hash_compute (int type, const char *name, uint32_t *hash_p) +dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p) { - char *rsync_friendly_name = NULL; + char *rsync_friendly_name = NULL; + dht_conf_t *priv = this->private; + size_t len = 0; + gf_boolean_t munged = _gf_false; + + /* + * It wouldn't be safe to use alloca in an inline function that doesn't + * actually get inlined, and it wouldn't be efficient to do a real + * allocation, so we use alloca here (if needed) and pass that to the + * inline. + */ + + if (priv->extra_regex_valid) { + len = strlen(name) + 1; + rsync_friendly_name = alloca(len); + munged = dht_munge_name (name, rsync_friendly_name, len, + &priv->extra_regex); + } + + if (!munged && priv->rsync_regex_valid) { + len = strlen(name) + 1; + rsync_friendly_name = alloca(len); + gf_log (this->name, GF_LOG_TRACE, "trying regex for %s", name); + munged = dht_munge_name (name, rsync_friendly_name, len, + &priv->rsync_regex); + if (munged) { + gf_log (this->name, GF_LOG_DEBUG, + "munged down to %s", rsync_friendly_name); + } + } - MAKE_RSYNC_FRIENDLY_NAME (rsync_friendly_name, name); + if (!munged) { + rsync_friendly_name = (char *)name; + } return dht_hash_compute_internal (type, rsync_friendly_name, hash_p); } diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c index cd57b9ea0..311a48112 100644 --- a/xlators/cluster/dht/src/dht-helper.c +++ b/xlators/cluster/dht/src/dht-helper.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. 
- GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -27,6 +18,28 @@ #include "xlator.h" #include "dht-common.h" +static inline int +dht_inode_ctx_set1 (xlator_t *this, inode_t *inode, xlator_t *subvol) +{ + uint64_t tmp_subvol = 0; + + tmp_subvol = (long)subvol; + return inode_ctx_set1 (inode, this, &tmp_subvol); +} + +int +dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol) +{ + int ret = -1; + uint64_t tmp_subvol = 0; + + ret = inode_ctx_get1 (inode, this, &tmp_subvol); + if (tmp_subvol && subvol) + *subvol = (xlator_t *)tmp_subvol; + + return ret; +} + int dht_frame_return (call_frame_t *frame) @@ -49,6 +62,43 @@ dht_frame_return (call_frame_t *frame) } +static uint64_t +dht_bits_for (uint64_t num) +{ + uint64_t bits = 0, ctrl = 1; + + while (ctrl < num) { + ctrl *= 2; + bits ++; + } + + return bits; +} + +/* + * A slightly "updated" version of the algorithm described in the commit log + * is used here. 
+ * + * The only enhancement is that: + * + * - The number of bits used by the backend filesystem for HUGE d_off which + * is described as 63, and + * - The number of bits used by the d_off presented by the transformation + * upwards which is described as 64, are both made "configurable." + */ + + +#define BACKEND_D_OFF_BITS 63 +#define PRESENT_D_OFF_BITS 63 + +#define ONE 1ULL +#define MASK (~0ULL) +#define PRESENT_MASK (MASK >> (64 - PRESENT_D_OFF_BITS)) +#define BACKEND_MASK (MASK >> (64 - BACKEND_D_OFF_BITS)) + +#define TOP_BIT (ONE << (PRESENT_D_OFF_BITS - 1)) +#define SHIFT_BITS (max (0, (BACKEND_D_OFF_BITS - PRESENT_D_OFF_BITS + 1))) + int dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p) { @@ -56,6 +106,9 @@ dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p) int cnt = 0; int max = 0; uint64_t y = 0; + uint64_t hi_mask = 0; + uint64_t off_mask = 0; + int max_bits = 0; if (x == ((uint64_t) -1)) { y = (uint64_t) -1; @@ -69,7 +122,23 @@ dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p) max = conf->subvolume_cnt; cnt = dht_subvol_cnt (this, subvol); - y = ((x * max) + cnt); + if (max == 1) { + y = x; + goto out; + } + + max_bits = dht_bits_for (max); + + hi_mask = ~(PRESENT_MASK >> (max_bits + 1)); + + if (x & hi_mask) { + /* HUGE d_off */ + off_mask = MASK << max_bits; + y = TOP_BIT | ((x >> SHIFT_BITS) & off_mask) | cnt; + } else { + /* small d_off */ + y = ((x * max) + cnt); + } out: if (y_p) @@ -89,7 +158,7 @@ dht_filter_loc_subvol_key (xlator_t *this, loc_t *loc, loc_t *new_loc, int ret = 0; /* not found */ /* Why do other tasks if first required 'char' itself is not there */ - if (loc->name && !strchr (loc->name, '@')) + if (!new_loc || !loc || !loc->name || !strchr (loc->name, '@')) goto out; trav = this->children; @@ -117,7 +186,6 @@ dht_filter_loc_subvol_key (xlator_t *this, loc_t *loc, loc_t *new_loc, new_loc->path = ((new_path) ? 
new_path: gf_strdup (loc->path)); new_loc->name = new_name; - new_loc->ino = loc->ino; new_loc->inode = inode_ref (loc->inode); new_loc->parent = inode_ref (loc->parent); } @@ -130,10 +198,8 @@ dht_filter_loc_subvol_key (xlator_t *this, loc_t *loc, loc_t *new_loc, out: if (!ret) { /* !success */ - if (new_path) - GF_FREE (new_path); - if (new_name) - GF_FREE (new_name); + GF_FREE (new_path); + GF_FREE (new_name); } return ret; } @@ -147,16 +213,38 @@ dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p, int max = 0; uint64_t x = 0; xlator_t *subvol = 0; + int max_bits = 0; + uint64_t off_mask = 0; + uint64_t host_mask = 0; if (!this->private) - goto out; + return -1; conf = this->private; max = conf->subvolume_cnt; - cnt = y % max; - x = y / max; + if (max == 1) { + x = y; + cnt = 0; + goto out; + } + + if (y & TOP_BIT) { + /* HUGE d_off */ + max_bits = dht_bits_for (max); + off_mask = (MASK << max_bits); + host_mask = ~(off_mask); + + x = ((y & ~TOP_BIT) & off_mask) << SHIFT_BITS; + + cnt = y & host_mask; + } else { + /* small d_off */ + cnt = y % max; + x = y / max; + } +out: subvol = conf->subvolumes[cnt]; if (subvol_p) @@ -165,7 +253,6 @@ dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p, if (x_p) *x_p = x; -out: return 0; } @@ -216,58 +303,72 @@ dht_local_wipe (xlator_t *this, dht_local_t *local) local->selfheal.layout = NULL; } - if (local->newpath) { - GF_FREE (local->newpath); - } + GF_FREE (local->newpath); - if (local->key) { - GF_FREE (local->key); - } + GF_FREE (local->key); - GF_FREE (local); + GF_FREE (local->rebalance.vector); + + if (local->rebalance.iobref) + iobref_unref (local->rebalance.iobref); + + mem_put (local); } dht_local_t * -dht_local_init (call_frame_t *frame) +dht_local_init (call_frame_t *frame, loc_t *loc, fd_t *fd, glusterfs_fop_t fop) { dht_local_t *local = NULL; + inode_t *inode = NULL; + int ret = 0; - /* TODO: use mem-pool */ - local = GF_CALLOC (1, sizeof (*local), - gf_dht_mt_dht_local_t); - + 
local = mem_get0 (THIS->local_pool); if (!local) - return NULL; + goto out; - local->op_ret = -1; - local->op_errno = EUCLEAN; + if (loc) { + ret = loc_copy (&local->loc, loc); + if (ret) + goto out; - frame->local = local; + inode = loc->inode; + } - return local; -} + if (fd) { + local->fd = fd_ref (fd); + if (!inode) + inode = fd->inode; + } + local->op_ret = -1; + local->op_errno = EUCLEAN; + local->fop = fop; -char * -basestr (const char *str) -{ - char *basestr = NULL; + if (inode) { + local->layout = dht_layout_get (frame->this, inode); + local->cached_subvol = dht_subvol_get_cached (frame->this, + inode); + } - basestr = strrchr (str, '/'); - if (basestr) - basestr ++; + frame->local = local; - return basestr; +out: + if (ret) { + if (local) + mem_put (local); + local = NULL; + } + return local; } - xlator_t * dht_first_up_subvol (xlator_t *this) { dht_conf_t *conf = NULL; xlator_t *child = NULL; int i = 0; + time_t time = 0; conf = this->private; if (!conf) @@ -276,9 +377,14 @@ dht_first_up_subvol (xlator_t *this) LOCK (&conf->subvolume_lock); { for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->subvolume_status[i]) { - child = conf->subvolumes[i]; - break; + if (conf->subvol_up_time[i]) { + if (!time) { + time = conf->subvol_up_time[i]; + child = conf->subvolumes[i]; + } else if (time > conf->subvol_up_time[i]) { + time = conf->subvol_up_time[i]; + child = conf->subvolumes[i]; + } } } } @@ -320,17 +426,23 @@ dht_subvol_get_hashed (xlator_t *this, loc_t *loc) dht_layout_t *layout = NULL; xlator_t *subvol = NULL; - if (is_fs_root (loc)) { + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + + if (__is_root_gfid (loc->gfid)) { subvol = dht_first_up_subvol (this); goto out; } + GF_VALIDATE_OR_GOTO (this->name, loc->parent, out); + GF_VALIDATE_OR_GOTO (this->name, loc->name, out); + layout = dht_layout_get (this, loc->parent); if (!layout) { gf_log (this->name, GF_LOG_DEBUG, - "layout missing path=%s parent=%"PRId64, - 
loc->path, loc->parent->ino); + "layout missing path=%s parent=%s", + loc->path, uuid_utoa (loc->parent->gfid)); goto out; } @@ -358,6 +470,8 @@ dht_subvol_get_cached (xlator_t *this, inode_t *inode) dht_layout_t *layout = NULL; xlator_t *subvol = NULL; + GF_VALIDATE_OR_GOTO (this->name, this, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); layout = dht_layout_get (this, inode); @@ -399,7 +513,36 @@ out: return next; } +/* This func wraps around, if prev is actually the last subvol. + */ +xlator_t * +dht_subvol_next_available (xlator_t *this, xlator_t *prev) +{ + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *next = NULL; + + conf = this->private; + if (!conf) + goto out; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == prev) { + /* if prev is last in conf->subvolumes, then wrap + * around. + */ + if ((i + 1) < conf->subvolume_cnt) { + next = conf->subvolumes[i + 1]; + } else { + next = conf->subvolumes[0]; + } + break; + } + } + +out: + return next; +} int dht_subvol_cnt (xlator_t *this, xlator_t *subvol) { @@ -428,6 +571,15 @@ out: (a) = (b); \ } while (0) + +#define set_if_greater_time(a, an, b, bn) do { \ + if (((a) < (b)) || (((a) == (b)) && ((an) < (bn)))){ \ + (a) = (b); \ + (an) = (bn); \ + } \ + } while (0) \ + + int dht_iatt_merge (xlator_t *this, struct iatt *to, struct iatt *from, xlator_t *subvol) @@ -439,8 +591,7 @@ dht_iatt_merge (xlator_t *this, struct iatt *to, uuid_copy (to->ia_gfid, from->ia_gfid); - dht_itransform (this, subvol, from->ia_ino, &to->ia_ino); - + to->ia_ino = from->ia_ino; to->ia_prot = from->ia_prot; to->ia_type = from->ia_type; to->ia_nlink = from->ia_nlink; @@ -452,9 +603,12 @@ dht_iatt_merge (xlator_t *this, struct iatt *to, set_if_greater (to->ia_uid, from->ia_uid); set_if_greater (to->ia_gid, from->ia_gid); - set_if_greater (to->ia_atime, from->ia_atime); - set_if_greater (to->ia_mtime, from->ia_mtime); - set_if_greater (to->ia_ctime, from->ia_ctime); + set_if_greater_time(to->ia_atime, 
to->ia_atime_nsec, + from->ia_atime, from->ia_atime_nsec); + set_if_greater_time (to->ia_mtime, to->ia_mtime_nsec, + from->ia_mtime, from->ia_mtime_nsec); + set_if_greater_time (to->ia_ctime, to->ia_ctime_nsec, + from->ia_ctime, from->ia_ctime_nsec); return 0; } @@ -491,3 +645,503 @@ err: loc_wipe (child); return -1; } + + + +int +dht_init_subvolumes (xlator_t *this, dht_conf_t *conf) +{ + xlator_list_t *subvols = NULL; + int cnt = 0; + + if (!conf) + return -1; + + for (subvols = this->children; subvols; subvols = subvols->next) + cnt++; + + conf->subvolumes = GF_CALLOC (cnt, sizeof (xlator_t *), + gf_dht_mt_xlator_t); + if (!conf->subvolumes) { + return -1; + } + conf->subvolume_cnt = cnt; + + cnt = 0; + for (subvols = this->children; subvols; subvols = subvols->next) + conf->subvolumes[cnt++] = subvols->xlator; + + conf->subvolume_status = GF_CALLOC (cnt, sizeof (char), + gf_dht_mt_char); + if (!conf->subvolume_status) { + return -1; + } + + conf->last_event = GF_CALLOC (cnt, sizeof (int), + gf_dht_mt_char); + if (!conf->last_event) { + return -1; + } + + conf->subvol_up_time = GF_CALLOC (cnt, sizeof (time_t), + gf_dht_mt_subvol_time); + if (!conf->subvol_up_time) { + return -1; + } + + conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t), + gf_dht_mt_dht_du_t); + if (!conf->du_stats) { + return -1; + } + + conf->decommissioned_bricks = GF_CALLOC (cnt, sizeof (xlator_t *), + gf_dht_mt_xlator_t); + if (!conf->decommissioned_bricks) { + return -1; + } + + return 0; +} + + + + +static int +dht_migration_complete_check_done (int op_ret, call_frame_t *frame, void *data) +{ + dht_local_t *local = NULL; + + local = frame->local; + + local->rebalance.target_op_fn (THIS, frame, op_ret); + + return 0; +} + + +int +dht_migration_complete_check_task (void *data) +{ + int ret = -1; + xlator_t *src_node = NULL; + xlator_t *dst_node = NULL; + dht_local_t *local = NULL; + dict_t *dict = NULL; + dht_layout_t *layout = NULL; + struct iatt stbuf = {0,}; + xlator_t 
*this = NULL; + call_frame_t *frame = NULL; + loc_t tmp_loc = {0,}; + char *path = NULL; + dht_conf_t *conf = NULL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + uint64_t tmp_subvol = 0; + int open_failed = 0; + + this = THIS; + frame = data; + local = frame->local; + conf = this->private; + + src_node = local->cached_subvol; + + if (!local->loc.inode && !local->fd) + goto out; + + inode = (!local->fd) ? local->loc.inode : local->fd->inode; + + /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr + * as root:root. If a fd is already open, access check wont be done*/ + + if (!local->loc.inode) { + ret = syncop_fgetxattr (src_node, local->fd, &dict, + conf->link_xattr_name); + } else { + SYNCTASK_SETID (0, 0); + ret = syncop_getxattr (src_node, &local->loc, &dict, + conf->link_xattr_name); + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + } + + if (!ret) + dst_node = dht_linkfile_subvol (this, NULL, NULL, dict); + + if (ret) { + if ((errno != ENOENT) || (!local->loc.inode)) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to get the 'linkto' xattr %s", + local->loc.path, strerror (errno)); + goto out; + } + /* Need to do lookup on hashed subvol, then get the file */ + ret = syncop_lookup (this, &local->loc, NULL, &stbuf, NULL, + NULL); + if (ret) + goto out; + dst_node = dht_subvol_get_cached (this, local->loc.inode); + } + + if (!dst_node) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to get the destination node", + local->loc.path); + ret = -1; + goto out; + } + + /* lookup on dst */ + if (local->loc.inode) { + ret = syncop_lookup (dst_node, &local->loc, NULL, &stbuf, NULL, NULL); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to lookup the file on %s", + local->loc.path, dst_node->name); + goto out; + } + + if (uuid_compare (stbuf.ia_gfid, local->loc.inode->gfid)) { + gf_log (this->name, GF_LOG_ERROR, + "%s: gfid different on the target file on %s", + local->loc.path, dst_node->name); + ret = -1; + goto 
out; + } + } + + /* update inode ctx (the layout) */ + dht_layout_unref (this, local->layout); + + ret = dht_layout_preset (this, dst_node, inode); + if (ret != 0) { + gf_log (this->name, GF_LOG_DEBUG, + "%s: could not set preset layout for subvol %s", + local->loc.path, dst_node->name); + ret = -1; + goto out; + } + + layout = dht_layout_for_subvol (this, dst_node); + if (!layout) { + gf_log (this->name, GF_LOG_INFO, + "%s: no pre-set layout for subvolume %s", + local->loc.path, dst_node ? dst_node->name : "<nil>"); + ret = -1; + goto out; + } + + ret = dht_layout_set (this, inode, layout); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set the new layout", + local->loc.path); + goto out; + } + + local->cached_subvol = dst_node; + ret = 0; + + /* once we detect the migration complete, the inode-ctx2 is no more + required.. delete the ctx and also, it means, open() already + done on all the fd of inode */ + ret = inode_ctx_reset1 (inode, this, &tmp_subvol); + if (tmp_subvol) + goto out; + + if (list_empty (&inode->fd_list)) + goto out; + + /* perform open as root:root. 
There is window between linkfile + * creation(root:root) and setattr with the correct uid/gid + */ + SYNCTASK_SETID(0, 0); + + /* perform 'open()' on all the fd's present on the inode */ + tmp_loc.inode = inode; + inode_path (inode, NULL, &path); + if (path) + tmp_loc.path = path; + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + if (fd_is_anonymous (iter_fd)) + continue; + + ret = syncop_open (dst_node, &tmp_loc, + iter_fd->flags, iter_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "failed to open " + "the fd (%p, flags=0%o) on file %s @ %s", + iter_fd, iter_fd->flags, path, dst_node->name); + open_failed = 1; + } + } + GF_FREE (path); + + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + if (open_failed) { + ret = -1; + goto out; + } + ret = 0; +out: + + return ret; +} + +int +dht_rebalance_complete_check (xlator_t *this, call_frame_t *frame) +{ + int ret = -1; + + ret = synctask_new (this->ctx->env, dht_migration_complete_check_task, + dht_migration_complete_check_done, + frame, frame); + return ret; +} + +/* During 'in-progress' state, both nodes should have the file */ +static int +dht_inprogress_check_done (int op_ret, call_frame_t *sync_frame, void *data) +{ + dht_local_t *local = NULL; + + local = sync_frame->local; + + local->rebalance.target_op_fn (THIS, sync_frame, op_ret); + + return 0; +} + +static int +dht_rebalance_inprogress_task (void *data) +{ + int ret = -1; + xlator_t *src_node = NULL; + xlator_t *dst_node = NULL; + dht_local_t *local = NULL; + dict_t *dict = NULL; + call_frame_t *frame = NULL; + xlator_t *this = NULL; + char *path = NULL; + struct iatt stbuf = {0,}; + loc_t tmp_loc = {0,}; + dht_conf_t *conf = NULL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + int open_failed = 0; + + this = THIS; + frame = data; + local = frame->local; + conf = this->private; + + src_node = local->cached_subvol; + + if (!local->loc.inode && !local->fd) + goto out; + + inode = (!local->fd) ? 
local->loc.inode : local->fd->inode; + + /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr + * as root:root. If a fd is already open, access check wont be done*/ + if (local->loc.inode) { + SYNCTASK_SETID (0, 0); + ret = syncop_getxattr (src_node, &local->loc, &dict, + conf->link_xattr_name); + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + } else { + ret = syncop_fgetxattr (src_node, local->fd, &dict, + conf->link_xattr_name); + } + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to get the 'linkto' xattr %s", + local->loc.path, strerror (errno)); + goto out; + } + + dst_node = dht_linkfile_subvol (this, NULL, NULL, dict); + if (!dst_node) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to get the 'linkto' xattr from dict", + local->loc.path); + ret = -1; + goto out; + } + + local->rebalance.target_node = dst_node; + + if (local->loc.inode) { + /* lookup on dst */ + ret = syncop_lookup (dst_node, &local->loc, NULL, + &stbuf, NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to lookup the file on %s", + local->loc.path, dst_node->name); + goto out; + } + + if (uuid_compare (stbuf.ia_gfid, local->loc.inode->gfid)) { + gf_log (this->name, GF_LOG_ERROR, + "%s: gfid different on the target file on %s", + local->loc.path, dst_node->name); + ret = -1; + goto out; + } + } + + ret = 0; + + if (list_empty (&inode->fd_list)) + goto done; + + /* perform open as root:root. 
There is window between linkfile + * creation(root:root) and setattr with the correct uid/gid + */ + SYNCTASK_SETID (0, 0); + + tmp_loc.inode = inode; + inode_path (inode, NULL, &path); + if (path) + tmp_loc.path = path; + + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + if (fd_is_anonymous (iter_fd)) + continue; + + ret = syncop_open (dst_node, &tmp_loc, + iter_fd->flags, iter_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "failed to send open " + "the fd (%p, flags=0%o) on file %s @ %s", + iter_fd, iter_fd->flags, path, dst_node->name); + open_failed = 1; + } + } + GF_FREE (path); + + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + + if (open_failed) { + ret = -1; + goto out; + } + +done: + ret = dht_inode_ctx_set1 (this, inode, dst_node); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set inode-ctx target file at %s", + local->loc.path, dst_node->name); + goto out; + } + + ret = 0; +out: + return ret; +} + +int +dht_rebalance_in_progress_check (xlator_t *this, call_frame_t *frame) +{ + + int ret = -1; + + ret = synctask_new (this->ctx->env, dht_rebalance_inprogress_task, + dht_inprogress_check_done, + frame, frame); + return ret; +} + +int +dht_inode_ctx_layout_set (inode_t *inode, xlator_t *this, + dht_layout_t *layout_int) +{ + dht_inode_ctx_t *ctx = NULL; + int ret = -1; + + ret = dht_inode_ctx_get (inode, this, &ctx); + if (!ret && ctx) { + ctx->layout = layout_int; + } else { + ctx = GF_CALLOC (1, sizeof (*ctx), gf_dht_mt_inode_ctx_t); + if (!ctx) + return ret; + ctx->layout = layout_int; + } + + ret = dht_inode_ctx_set (inode, this, ctx); + + return ret; +} + +int +dht_inode_ctx_time_update (inode_t *inode, xlator_t *this, struct iatt *stat, + int32_t post) +{ + dht_inode_ctx_t *ctx = NULL; + dht_stat_time_t *time = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO (this->name, stat, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); + + ret = dht_inode_ctx_get (inode, this, &ctx); + + if (ret) { + ctx = 
GF_CALLOC (1, sizeof (*ctx), gf_dht_mt_inode_ctx_t); + if (!ctx) + return -1; + } + + time = &ctx->time; + + DHT_UPDATE_TIME(time->mtime, time->mtime_nsec, + stat->ia_mtime, stat->ia_mtime_nsec, inode, post); + DHT_UPDATE_TIME(time->ctime, time->ctime_nsec, + stat->ia_ctime, stat->ia_ctime_nsec, inode, post); + DHT_UPDATE_TIME(time->atime, time->atime_nsec, + stat->ia_atime, stat->ia_atime_nsec, inode, post); + + ret = dht_inode_ctx_set (inode, this, ctx); +out: + return 0; +} + +int +dht_inode_ctx_get (inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx) +{ + int ret = -1; + uint64_t ctx_int = 0; + + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); + + ret = inode_ctx_get (inode, this, &ctx_int); + + if (ret) + return ret; + + if (ctx) + *ctx = (dht_inode_ctx_t *) ctx_int; +out: + return ret; +} + +int dht_inode_ctx_set (inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx) +{ + int ret = -1; + uint64_t ctx_int = 0; + + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); + GF_VALIDATE_OR_GOTO (this->name, ctx, out); + + ctx_int = (long)ctx; + ret = inode_ctx_set (inode, this, &ctx_int); +out: + return ret; +} diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c new file mode 100644 index 000000000..ece84151a --- /dev/null +++ b/xlators/cluster/dht/src/dht-inode-read.c @@ -0,0 +1,1139 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "dht-common.h" + +int dht_access2 (xlator_t *this, call_frame_t *frame, int ret); +int dht_readv2 (xlator_t *this, call_frame_t *frame, int ret); +int dht_attr2 (xlator_t *this, call_frame_t *frame, int ret); +int dht_open2 (xlator_t *this, call_frame_t *frame, int ret); +int dht_flush2 (xlator_t *this, call_frame_t *frame, int ret); +int dht_lk2 (xlator_t *this, call_frame_t *frame, int ret); +int dht_fsync2 (xlator_t *this, call_frame_t *frame, int ret); + +int +dht_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = 0; + + local = frame->local; + prev = cookie; + + local->op_errno = op_errno; + if ((op_ret == -1) && (op_errno != ENOENT)) { + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto out; + } + + if (!op_ret || (local->call_cnt != 1)) + goto out; + + /* rebalance would have happened */ + local->rebalance.target_op_fn = dht_open2; + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + +out: + DHT_STACK_UNWIND (open, frame, op_ret, op_errno, local->fd, xdata); + + return 0; +} + +int +dht_open2 (xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + int op_errno = EINVAL; + + local = frame->local; + if (!local) + goto out; + + op_errno = ENOENT; + if (op_ret) + goto out; + + local->call_cnt = 2; + subvol = local->cached_subvol; + + STACK_WIND (frame, dht_open_cbk, subvol, subvol->fops->open, + &local->loc, local->rebalance.flags, local->fd, + NULL); + return 0; + +out: + DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); + return 0; +} + +int +dht_open (call_frame_t *frame, xlator_t *this, + loc_t *loc, int flags, fd_t *fd, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + 
dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, loc, fd, GF_FOP_OPEN); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local->rebalance.flags = flags; + local->call_cnt = 1; + + STACK_WIND (frame, dht_open_cbk, subvol, subvol->fops->open, + loc, flags, fd, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (open, frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int +dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *stbuf, dict_t *xdata) +{ + xlator_t *subvol = 0; + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + inode_t *inode = NULL; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && (op_errno != ENOENT)) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto out; + } + + if (local->call_cnt != 1) + goto out; + + local->op_errno = op_errno; + /* Check if the rebalance phase2 is true */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) { + inode = (local->fd) ? local->fd->inode : local->loc.inode; + ret = dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) { + /* Phase 2 of migration */ + local->rebalance.target_op_fn = dht_attr2; + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } else { + /* value is already set in fd_ctx, that means no need + to check for whether its complete or not. 
*/ + dht_attr2 (this, frame, 0); + return 0; + } + } + +out: + DHT_STRIP_PHASE1_FLAGS (stbuf); + DHT_STACK_UNWIND (stat, frame, op_ret, op_errno, stbuf, xdata); +err: + return 0; +} + +int +dht_attr2 (xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + int op_errno = EINVAL; + + local = frame->local; + if (!local) + goto out; + + op_errno = local->op_errno; + if (op_ret == -1) + goto out; + + subvol = local->cached_subvol; + local->call_cnt = 2; + + if (local->fop == GF_FOP_FSTAT) { + STACK_WIND (frame, dht_file_attr_cbk, subvol, + subvol->fops->fstat, local->fd, NULL); + } else { + STACK_WIND (frame, dht_file_attr_cbk, subvol, + subvol->fops->stat, &local->loc, NULL); + } + return 0; + +out: + DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); + return 0; +} + +int +dht_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *stbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + + goto unlock; + } + + dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); + + local->op_ret = 0; + } +unlock: + UNLOCK (&frame->lock); +out: + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + DHT_STACK_UNWIND (stat, frame, local->op_ret, local->op_errno, + &local->stbuf, xdata); + } +err: + return 0; +} + +int +dht_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + dht_layout_t 
*layout = NULL; + int i = 0; + int call_cnt = 0; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + + local = dht_local_init (frame, loc, NULL, GF_FOP_STAT); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + layout = local->layout; + if (!layout) { + gf_log (this->name, GF_LOG_DEBUG, + "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + if (IA_ISREG (loc->inode->ia_type)) { + local->call_cnt = 1; + + subvol = local->cached_subvol; + + STACK_WIND (frame, dht_file_attr_cbk, subvol, + subvol->fops->stat, loc, xdata); + + return 0; + } + + local->call_cnt = call_cnt = layout->cnt; + + for (i = 0; i < call_cnt; i++) { + subvol = layout->list[i].xlator; + + STACK_WIND (frame, dht_attr_cbk, + subvol, subvol->fops->stat, + loc, xdata); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int +dht_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int i = 0; + int call_cnt = 0; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_FSTAT); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + layout = local->layout; + if (!layout) { + gf_log (this->name, GF_LOG_DEBUG, + "no layout for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + if (IA_ISREG (fd->inode->ia_type)) { + local->call_cnt = 1; + + subvol = local->cached_subvol; + + STACK_WIND (frame, dht_file_attr_cbk, subvol, + subvol->fops->fstat, fd, xdata); + + return 0; + } + + local->call_cnt = call_cnt = layout->cnt; + + for (i = 0; i < call_cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND (frame, dht_attr_cbk, + 
subvol, subvol->fops->fstat, + fd, xdata); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int +dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + struct iovec *vector, int count, struct iatt *stbuf, + struct iobref *iobref, dict_t *xdata) +{ + dht_local_t *local = NULL; + int ret = 0; + inode_t *inode = NULL; + xlator_t *subvol = 0; + + local = frame->local; + if (!local) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + /* This is already second try, no need for re-check */ + if (local->call_cnt != 1) + goto out; + + if ((op_ret == -1) && (op_errno != ENOENT)) + goto out; + + local->op_errno = op_errno; + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) { + /* File would be migrated to other node */ + ret = dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) { + local->rebalance.target_op_fn = dht_readv2; + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } else { + /* value is already set in fd_ctx, that means no need + to check for whether its complete or not. 
*/ + dht_readv2 (this, frame, 0); + return 0; + } + } + +out: + DHT_STRIP_PHASE1_FLAGS (stbuf); + DHT_STACK_UNWIND (readv, frame, op_ret, op_errno, vector, count, stbuf, + iobref, xdata); + + return 0; +} + +int +dht_readv2 (xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + int op_errno = EINVAL; + + local = frame->local; + if (!local) + goto out; + + op_errno = local->op_errno; + if (op_ret == -1) + goto out; + + local->call_cnt = 2; + subvol = local->cached_subvol; + + STACK_WIND (frame, dht_readv_cbk, subvol, subvol->fops->readv, + local->fd, local->rebalance.size, local->rebalance.offset, + local->rebalance.flags, NULL); + + return 0; + +out: + DHT_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL); + return 0; +} + +int +dht_readv (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t off, uint32_t flags, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_READ); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local->rebalance.offset = off; + local->rebalance.size = size; + local->rebalance.flags = flags; + local->call_cnt = 1; + + STACK_WIND (frame, dht_readv_cbk, + subvol, subvol->fops->readv, + fd, size, off, flags, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL); + + return 0; +} + +int +dht_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + int ret = -1; + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + call_frame_t *prev = NULL; + + local = frame->local; + prev = cookie; + + if (!prev || !prev->this) + goto out; + if (local->call_cnt != 1) + goto out; + if ((op_ret == -1) && (op_errno == ENOTCONN) && + IA_ISDIR(local->loc.inode->ia_type)) { + + subvol = dht_subvol_next_available (this, prev->this); + if (!subvol) + goto out; + + /* check if we are done with visiting every node */ + if (subvol == local->cached_subvol) { + goto out; + } + + STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access, + &local->loc, local->rebalance.flags, NULL); + return 0; + } + if ((op_ret == -1) && (op_errno == ENOENT)) { + /* File would be migrated to other node */ + local->op_errno = op_errno; + local->rebalance.target_op_fn = dht_access2; + ret = dht_rebalance_complete_check (frame->this, frame); + if (!ret) + return 0; + } + +out: + DHT_STACK_UNWIND (access, frame, op_ret, op_errno, xdata); + return 0; +} + +int +dht_access2 (xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + int op_errno = EINVAL; + + local = frame->local; + if (!local) + goto out; + + op_errno = local->op_errno; + if (op_ret == -1) + goto out; + + local->call_cnt = 2; + subvol = local->cached_subvol; + + STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access, + &local->loc, local->rebalance.flags, NULL); + + return 0; + +out: + DHT_STACK_UNWIND (access, frame, -1, op_errno, NULL); + return 0; +} + + +int +dht_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, 
err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + local = dht_local_init (frame, loc, NULL, GF_FOP_ACCESS); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.flags = mask; + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access, + loc, mask, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (access, frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + inode_t *inode = NULL; + xlator_t *subvol = 0; + + local = frame->local; + + local->op_errno = op_errno; + + if (local->call_cnt != 1) + goto out; + + /* If context is set, then send flush() it to the destination */ + dht_inode_ctx_get1 (this, inode, &subvol); + if (subvol) { + dht_flush2 (this, frame, 0); + return 0; + } + +out: + DHT_STACK_UNWIND (flush, frame, op_ret, op_errno, xdata); + + return 0; +} + +int +dht_flush2 (xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND (frame, dht_flush_cbk, + subvol, subvol->fops->flush, local->fd, NULL); + + return 0; +} + + +int +dht_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_FLUSH); + 
if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local->call_cnt = 1; + + STACK_WIND (frame, dht_flush_cbk, + subvol, subvol->fops->flush, fd, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (flush, frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + inode_t *inode = NULL; + xlator_t *subvol = 0; + + local = frame->local; + prev = cookie; + + local->op_errno = op_errno; + if (op_ret == -1 && (op_errno != ENOENT)) { + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + + local->op_errno = op_errno; + dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) { + local->rebalance.target_op_fn = dht_fsync2; + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + + ret = dht_rebalance_in_progress_check (this, frame); + } + + /* Check if the rebalance phase2 is true */ + if (IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + } + if (!ret) + return 0; + } else { + dht_fsync2 (this, frame, 0); + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (fsync, frame, op_ret, op_errno, + prebuf, postbuf, xdata); + + return 
0; +} + +int +dht_fsync2 (xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND (frame, dht_fsync_cbk, subvol, subvol->fops->fsync, + local->fd, local->rebalance.flags, NULL); + + return 0; +} + +int +dht_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync, + dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_FSYNC); + if (!local) { + op_errno = ENOMEM; + + goto err; + } + + local->call_cnt = 1; + local->rebalance.flags = datasync; + + subvol = local->cached_subvol; + + STACK_WIND (frame, dht_fsync_cbk, subvol, subvol->fops->fsync, + fd, datasync, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + + +/* TODO: for 'lk()' call, we need some other special error, may be ESTALE to + indicate that lock migration happened on the fd, so we can consider it as + phase 2 of migration */ +int +dht_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct gf_flock *flock, dict_t *xdata) +{ + DHT_STACK_UNWIND (lk, frame, op_ret, op_errno, flock, xdata); + + return 0; +} + + +int +dht_lk (call_frame_t *frame, xlator_t *this, + fd_t *fd, int cmd, struct gf_flock *flock, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + /* TODO: for rebalance, we need to preserve the fop arguments */ + STACK_WIND (frame, dht_lk_cbk, subvol, subvol->fops->lk, fd, + cmd, flock, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); + + return 0; +} + +/* Symlinks are currently not migrated, so no need for any check here */ +int +dht_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, const char *path, + struct iatt *stbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + + local = frame->local; + if (op_ret == -1) + goto err; + + if (!local) { + op_ret = -1; + op_errno = EINVAL; + } + +err: + DHT_STRIP_PHASE1_FLAGS (stbuf); + DHT_STACK_UNWIND (readlink, frame, op_ret, op_errno, path, stbuf, xdata); + + return 0; +} + + +int +dht_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + local = dht_local_init (frame, loc, NULL, GF_FOP_READLINK); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_readlink_cbk, + subvol, subvol->fops->readlink, + loc, size, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (readlink, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + +/* Currently no translators on top of 'distribute' will be using + * below fops, hence not implementing 'migration' related checks + */ + +int +dht_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) +{ + DHT_STACK_UNWIND (xattrop, frame, op_ret, op_errno, dict, xdata); + return 0; +} + + +int +dht_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + local = dht_local_init (frame, loc, NULL, GF_FOP_XATTROP); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local->call_cnt = 1; + + STACK_WIND (frame, + dht_xattrop_cbk, + subvol, subvol->fops->xattrop, + loc, flags, dict, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int +dht_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) +{ + DHT_STACK_UNWIND (fxattrop, frame, op_ret, op_errno, dict, xdata); + return 0; +} + + +int +dht_fxattrop (call_frame_t *frame, xlator_t *this, + fd_t *fd, gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + dht_fxattrop_cbk, + subvol, subvol->fops->fxattrop, + fd, flags, dict, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int +dht_inodelk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) + +{ + DHT_STACK_UNWIND (inodelk, frame, op_ret, op_errno, xdata); + return 0; +} + + +int32_t +dht_inodelk (call_frame_t *frame, xlator_t *this, const char *volume, + loc_t *loc, int32_t cmd, struct gf_flock *lock, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + local = dht_local_init (frame, loc, NULL, GF_FOP_INODELK); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local->call_cnt = 1; + + STACK_WIND (frame, 
+ dht_inodelk_cbk, + subvol, subvol->fops->inodelk, + volume, loc, cmd, lock, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) + +{ + DHT_STACK_UNWIND (finodelk, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +dht_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, + fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + + STACK_WIND (frame, dht_finodelk_cbk, subvol, subvol->fops->finodelk, + volume, fd, cmd, lock, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL); + + return 0; +} diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c new file mode 100644 index 000000000..4b3f3a049 --- /dev/null +++ b/xlators/cluster/dht/src/dht-inode-write.c @@ -0,0 +1,1013 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "dht-common.h" + +int dht_writev2 (xlator_t *this, call_frame_t *frame, int ret); +int dht_truncate2 (xlator_t *this, call_frame_t *frame, int ret); +int dht_setattr2 (xlator_t *this, call_frame_t *frame, int ret); +int dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret); +int dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret); +int dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret); + +int +dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + int ret = -1; + xlator_t *subvol = NULL; + + if (op_ret == -1 && (op_errno != ENOENT)) { + goto out; + } + + local = frame->local; + if (!local) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + if (local->call_cnt != 1) { + /* preserve the modes of source */ + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + + local->rebalance.target_op_fn = dht_writev2; + + local->op_errno = op_errno; + /* Phase 2 of migration */ + if (IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + + ret = dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol) { + dht_writev2 (this, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check (this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + + DHT_STACK_UNWIND (writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + + return 0; +} + +int +dht_writev2 (xlator_t *this, 
call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND (frame, dht_writev_cbk, + subvol, subvol->fops->writev, + local->fd, local->rebalance.vector, local->rebalance.count, + local->rebalance.offset, local->rebalance.flags, + local->rebalance.iobref, NULL); + + return 0; +} + +int +dht_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int count, off_t off, uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_WRITE); + if (!local) { + + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + + local->rebalance.vector = iov_dup (vector, count); + local->rebalance.offset = off; + local->rebalance.count = count; + local->rebalance.flags = flags; + local->rebalance.iobref = iobref_ref (iobref); + local->call_cnt = 1; + + STACK_WIND (frame, dht_writev_cbk, + subvol, subvol->fops->writev, + fd, vector, count, off, flags, iobref, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + + + +int +dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + xlator_t *subvol = NULL; + inode_t *inode = NULL; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && (op_errno != ENOENT)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + + local->rebalance.target_op_fn = dht_truncate2; + + local->op_errno = op_errno; + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + inode = (local->fd) ? 
local->fd->inode : local->loc.inode; + dht_inode_ctx_get1 (this, inode, &subvol); + if (subvol) { + dht_truncate2 (this, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check (this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (truncate, frame, op_ret, op_errno, + prebuf, postbuf, xdata); +err: + return 0; +} + + +int +dht_truncate2 (xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + inode_t *inode = NULL; + + local = frame->local; + + inode = local->fd ? local->fd->inode : local->loc.inode; + + dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + if (local->fop == GF_FOP_TRUNCATE) { + STACK_WIND (frame, dht_truncate_cbk, subvol, + subvol->fops->truncate, &local->loc, + local->rebalance.offset, NULL); + } else { + STACK_WIND (frame, dht_truncate_cbk, subvol, + subvol->fops->ftruncate, local->fd, + local->rebalance.offset, NULL); + } + + return 0; +} + +int +dht_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + local = dht_local_init (frame, loc, NULL, GF_FOP_TRUNCATE); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.offset = offset; + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_truncate_cbk, + subvol, subvol->fops->truncate, + loc, offset, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + +int +dht_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_FTRUNCATE); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.offset = offset; + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_truncate_cbk, + subvol, subvol->fops->ftruncate, + fd, offset, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + + +int +dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + xlator_t *subvol = NULL; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && (op_errno != ENOENT)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + local->rebalance.target_op_fn = dht_fallocate2; + + /* Phase 2 of migration */ + if ((op_ret 
== -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol) { + dht_fallocate2 (this, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check (this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (fallocate, frame, op_ret, op_errno, + prebuf, postbuf, xdata); +err: + return 0; +} + +int +dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND(frame, dht_fallocate_cbk, subvol, subvol->fops->fallocate, + local->fd, local->rebalance.flags, local->rebalance.offset, + local->rebalance.size, NULL); + + return 0; +} + +int +dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_FALLOCATE); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.flags = mode; + local->rebalance.offset = offset; + local->rebalance.size = len; + + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_fallocate_cbk, + subvol, 
subvol->fops->fallocate, + fd, mode, offset, len, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + + +int +dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + xlator_t *subvol = NULL; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && (op_errno != ENOENT)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + local->rebalance.target_op_fn = dht_discard2; + + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol) { + dht_discard2 (this, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check (this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (discard, frame, op_ret, op_errno, + prebuf, postbuf, xdata); +err: + return 0; +} + +int +dht_discard2(xlator_t *this, call_frame_t 
*frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND(frame, dht_discard_cbk, subvol, subvol->fops->discard, + local->fd, local->rebalance.offset, local->rebalance.size, + NULL); + + return 0; +} + +int +dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_DISCARD); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.offset = offset; + local->rebalance.size = len; + + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_discard_cbk, subvol, subvol->fops->discard, + fd, offset, len, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + +int +dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && (op_errno != ENOENT)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + local->rebalance.target_op_fn = dht_zerofill2; + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + ret = fd_ctx_get (local->fd, this, NULL); + if (!ret) { + dht_zerofill2 (this, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check (this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (zerofill, frame, op_ret, op_errno, + prebuf, postbuf, xdata); +err: + return 0; +} + +int +dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + uint64_t tmp_subvol = 0; + int ret = -1; + + local = frame->local; + + if 
(local->fd) + ret = fd_ctx_get (local->fd, this, &tmp_subvol); + if (!ret) + subvol = (xlator_t *)(long)tmp_subvol; + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND(frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill, + local->fd, local->rebalance.offset, local->rebalance.size, + NULL); + + return 0; +} + +int +dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_ZEROFILL); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.offset = offset; + local->rebalance.size = len; + + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill, + fd, offset, len, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + + + +/* handle cases of migration here for 'setattr()' calls */ +int +dht_file_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + + local = frame->local; + prev = cookie; + + local->op_errno = op_errno; + if ((op_ret == -1) && (op_errno != ENOENT)) { + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto out; + } + + if (local->call_cnt != 1) + goto out; + + local->rebalance.target_op_fn = dht_setattr2; + + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* At the end of the migration process, whatever 'attr' we + have on source file will be migrated to destination file + in one shot, hence we don't need to check for in progress + state here (ie, PHASE1) */ +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (setattr, frame, op_ret, op_errno, + prebuf, postbuf, xdata); + + return 0; +} + +int +dht_setattr2 (xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + inode_t *inode = NULL; + + local = frame->local; + + inode = (local->fd) ? 
local->fd->inode : local->loc.inode; + + dht_inode_ctx_get1 (this, inode, &subvol); + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + if (local->fop == GF_FOP_SETATTR) { + STACK_WIND (frame, dht_file_setattr_cbk, subvol, + subvol->fops->setattr, &local->loc, + &local->rebalance.stbuf, local->rebalance.flags, + NULL); + } else { + STACK_WIND (frame, dht_file_setattr_cbk, subvol, + subvol->fops->fsetattr, local->fd, + &local->rebalance.stbuf, local->rebalance.flags, + NULL); + } + + return 0; +} + + +/* Keep the existing code same for all the cases other than regular file */ +int +dht_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + + + local = frame->local; + prev = cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto unlock; + } + + dht_iatt_merge (this, &local->prebuf, statpre, prev->this); + dht_iatt_merge (this, &local->stbuf, statpost, prev->this); + + local->op_ret = 0; + } +unlock: + UNLOCK (&frame->lock); + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) + DHT_STACK_UNWIND (setattr, frame, local->op_ret, local->op_errno, + &local->prebuf, &local->stbuf, xdata); + + return 0; +} + + +int +dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + xlator_t *subvol = NULL; + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + int i = -1; + int call_cnt = 0; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + local = 
dht_local_init (frame, loc, NULL, GF_FOP_SETATTR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + layout = local->layout; + if (!layout) { + gf_log (this->name, GF_LOG_DEBUG, + "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + if (!layout_is_sane (layout)) { + gf_log (this->name, GF_LOG_DEBUG, + "layout is not sane for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + if (IA_ISREG (loc->inode->ia_type)) { + /* in the regular file _cbk(), we need to check for + migration possibilities */ + local->rebalance.stbuf = *stbuf; + local->rebalance.flags = valid; + local->call_cnt = 1; + subvol = local->cached_subvol; + + STACK_WIND (frame, dht_file_setattr_cbk, subvol, + subvol->fops->setattr, + loc, stbuf, valid, xdata); + + return 0; + } + + local->call_cnt = call_cnt = layout->cnt; + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_setattr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->setattr, + loc, stbuf, valid, xdata); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + + +int +dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + xlator_t *subvol = NULL; + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + int i = -1; + int call_cnt = 0; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_FSETATTR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + layout = local->layout; + if (!layout) { + gf_log (this->name, GF_LOG_DEBUG, + "no layout for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + if (!layout_is_sane (layout)) { + gf_log (this->name, GF_LOG_DEBUG, + "layout is not sane for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + if (IA_ISREG (fd->inode->ia_type)) { + /* in the regular file _cbk(), we need to check for + migration possibilities */ + local->rebalance.stbuf = *stbuf; + local->rebalance.flags = valid; + local->call_cnt = 1; + subvol = local->cached_subvol; + + STACK_WIND (frame, dht_file_setattr_cbk, subvol, + subvol->fops->fsetattr, + fd, stbuf, valid, xdata); + + return 0; + } + + local->call_cnt = call_cnt = layout->cnt; + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_setattr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->fsetattr, + fd, stbuf, valid, xdata); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c index 16767adb9..38e9970a7 100644 --- a/xlators/cluster/dht/src/dht-layout.c +++ b/xlators/cluster/dht/src/dht-layout.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. 
<http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -52,8 +43,11 @@ dht_layout_new (xlator_t *this, int cnt) layout->type = DHT_HASH_TYPE_DM; layout->cnt = cnt; - if (conf) + + if (conf) { + layout->spread_cnt = conf->dir_spread_cnt; layout->gen = conf->gen; + } layout->ref = 1; out: @@ -65,9 +59,7 @@ dht_layout_t * dht_layout_get (xlator_t *this, inode_t *inode) { dht_conf_t *conf = NULL; - uint64_t layout_int = 0; dht_layout_t *layout = NULL; - int ret = -1; conf = this->private; if (!conf) @@ -75,9 +67,8 @@ dht_layout_get (xlator_t *this, inode_t *inode) LOCK (&conf->layout_lock); { - ret = inode_ctx_get (inode, this, &layout_int); - if (ret == 0) { - layout = (dht_layout_t *) (unsigned long) layout_int; + dht_inode_ctx_layout_get (inode, this, &layout); + if (layout) { layout->ref++; } } @@ -95,7 +86,6 @@ dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout) int oldret = -1; int ret = 0; dht_layout_t *old_layout; - uint64_t old_layout_int; conf = this->private; if (!conf) @@ -103,16 +93,13 @@ dht_layout_set (xlator_t *this, 
inode_t *inode, dht_layout_t *layout) LOCK (&conf->layout_lock); { - oldret = inode_ctx_get (inode, this, &old_layout_int); - + oldret = dht_inode_ctx_layout_get (inode, this, &old_layout); layout->ref++; - ret = inode_ctx_put (inode, this, (uint64_t) (unsigned long) - layout); + dht_inode_ctx_layout_set (inode, this, layout); } UNLOCK (&conf->layout_lock); - if (oldret == 0) { - old_layout = (dht_layout_t *) (unsigned long) old_layout_int; + if (!oldret) { dht_layout_unref (this, old_layout); } @@ -127,7 +114,7 @@ dht_layout_unref (xlator_t *this, dht_layout_t *layout) dht_conf_t *conf = NULL; int ref = 0; - if (layout->preset || !this->private) + if (!layout || layout->preset || !this->private) return; conf = this->private; @@ -171,9 +158,9 @@ dht_layout_search (xlator_t *this, dht_layout_t *layout, const char *name) int ret = 0; - ret = dht_hash_compute (layout->type, name, &hash); + ret = dht_hash_compute (this, layout->type, name, &hash); if (ret != 0) { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_WARNING, "hash computation failed for type=%d name=%s", layout->type, name); goto out; @@ -188,7 +175,7 @@ dht_layout_search (xlator_t *this, dht_layout_t *layout, const char *name) } if (!subvol) { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_WARNING, "no subvolume for hash (value) = %u", hash); } @@ -277,6 +264,9 @@ dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout, if (disk_layout_p) *disk_layout_p = disk_layout; + else + GF_FREE (disk_layout); + ret = 0; out: @@ -286,7 +276,7 @@ out: int dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, - int pos, void *disk_layout_raw) + int pos, void *disk_layout_raw, int disk_layout_len) { int cnt = 0; int type = 0; @@ -294,19 +284,38 @@ dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, int stop_off = 0; int disk_layout[4]; - /* TODO: assert disk_layout_ptr is of required length */ + if (!disk_layout_raw) { + gf_log (this->name, GF_LOG_CRITICAL, + "error 
no layout on disk for merge"); + return -1; + } - memcpy (disk_layout, disk_layout_raw, sizeof (disk_layout)); + GF_ASSERT (disk_layout_len == sizeof (disk_layout)); + + memcpy (disk_layout, disk_layout_raw, disk_layout_len); cnt = ntoh32 (disk_layout[0]); if (cnt != 1) { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_ERROR, "disk layout has invalid count %d", cnt); return -1; } - /* TODO: assert type is compatible */ - type = ntoh32 (disk_layout[1]); + type = ntoh32 (disk_layout[1]); + switch (type) { + case DHT_HASH_TYPE_DM_USER: + gf_log (this->name, GF_LOG_DEBUG, "found user-set layout"); + layout->type = type; + /* Fall through. */ + case DHT_HASH_TYPE_DM: + break; + default: + gf_log (this->name, GF_LOG_CRITICAL, + "Catastrophic error layout with unknown type found %d", + disk_layout[1]); + return -1; + } + start_off = ntoh32 (disk_layout[2]); stop_off = ntoh32 (disk_layout[3]); @@ -326,11 +335,12 @@ int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, int op_ret, int op_errno, dict_t *xattr) { - int i = 0; - int ret = -1; - int err = -1; - void *disk_layout_raw = NULL; - + int i = 0; + int ret = -1; + int err = -1; + void *disk_layout_raw = NULL; + int disk_layout_len = 0; + dht_conf_t *conf = this->private; if (op_ret != 0) { err = op_errno; @@ -351,12 +361,12 @@ dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, if (xattr) { /* during lookup and not mkdir */ - ret = dict_get_ptr (xattr, "trusted.glusterfs.dht", - &disk_layout_raw); + ret = dict_get_ptr_and_len (xattr, conf->xattr_name, + &disk_layout_raw, &disk_layout_len); } if (ret != 0) { - layout->list[i].err = -1; + layout->list[i].err = 0; gf_log (this->name, GF_LOG_TRACE, "missing disk layout on %s. 
err = %d", subvol->name, err); @@ -364,9 +374,10 @@ dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, goto out; } - ret = dht_disk_layout_merge (this, layout, i, disk_layout_raw); + ret = dht_disk_layout_merge (this, layout, i, disk_layout_raw, + disk_layout_len); if (ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_WARNING, "layout merge from subvolume %s failed", subvol->name); goto out; @@ -402,6 +413,22 @@ dht_layout_entry_swap (dht_layout_t *layout, int i, int j) layout->list[j].err = err_swap; } +void +dht_layout_range_swap (dht_layout_t *layout, int i, int j) +{ + uint32_t start_swap = 0; + uint32_t stop_swap = 0; + + start_swap = layout->list[i].start; + stop_swap = layout->list[i].stop; + + layout->list[i].start = layout->list[j].start; + layout->list[i].stop = layout->list[j].stop; + + layout->list[j].start = start_swap; + layout->list[j].stop = stop_swap; +} + int64_t dht_layout_entry_cmp_volname (dht_layout_t *layout, int i, int j) { @@ -409,17 +436,37 @@ dht_layout_entry_cmp_volname (dht_layout_t *layout, int i, int j) layout->list[j].xlator->name)); } + +gf_boolean_t +dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator) +{ + int i = 0; + + for (i = 0; i < layout->cnt; i++) { + if (!strcmp (layout->list[i].xlator->name, xlator->name)) + return _gf_true; + } + return _gf_false; +} + int64_t dht_layout_entry_cmp (dht_layout_t *layout, int i, int j) { int64_t diff = 0; + /* swap zero'ed out layouts to front, if needed */ + if (!layout->list[j].start && !layout->list[j].stop) { + diff = (int64_t) layout->list[i].stop + - (int64_t) layout->list[j].stop; + goto out; + } if (layout->list[i].err || layout->list[j].err) diff = layout->list[i].err - layout->list[j].err; else diff = (int64_t) layout->list[i].start - (int64_t) layout->list[j].start; +out: return diff; } @@ -468,7 +515,8 @@ dht_layout_sort_volname (dht_layout_t *layout) int dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t 
*layout, uint32_t *holes_p, uint32_t *overlaps_p, - uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p) + uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p, + uint32_t *no_space_p) { uint32_t overlaps = 0; uint32_t missing = 0; @@ -481,30 +529,38 @@ dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, uint32_t prev_stop = 0; uint32_t last_stop = 0; char is_virgin = 1; + uint32_t no_space = 0; - /* TODO: explain WTF is happening */ + /* TODO: explain what is happening */ last_stop = layout->list[0].start - 1; prev_stop = last_stop; for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].err) { - switch (layout->list[i].err) { - case -1: - case ENOENT: - missing++; - break; - case ENOTCONN: - down++; - break; - case ENOSPC: - down++; - break; - default: - misc++; + switch (layout->list[i].err) { + case -1: + case ENOENT: + missing++; + continue; + case ENOTCONN: + down++; + continue; + case ENOSPC: + no_space++; + continue; + case 0: + /* if err == 0 and start == stop, then it is a non misc++; + * participating subvolume(spread-cnt). Then, do not + * check for anomalies. 
If start != stop, then treat it + * as misc err */ + if (layout->list[i].start == layout->list[i].stop) { + continue; } + break; + default: + misc++; continue; - } + } is_virgin = 0; @@ -537,6 +593,9 @@ dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, if (misc_p) *misc_p = misc; + if (no_space_p) + *no_space_p = no_space; + return ret; } @@ -552,7 +611,6 @@ dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout) uint32_t down = 0; uint32_t misc = 0; - ret = dht_layout_sort (layout); if (ret == -1) { gf_log (this->name, GF_LOG_WARNING, @@ -562,7 +620,7 @@ dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout) ret = dht_layout_anomalies (this, loc, layout, &holes, &overlaps, - &missing, &down, &misc); + &missing, &down, &misc, NULL); if (ret == -1) { gf_log (this->name, GF_LOG_WARNING, "error while finding anomalies in %s -- not good news", @@ -580,43 +638,56 @@ dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout) "found anomalies in %s. holes=%d overlaps=%d", loc->path, holes, overlaps); } - ret = 1; + ret = -1; } for (i = 0; i < layout->cnt; i++) { - /* TODO During DHT selfheal rewrite (almost) find a better place to - * detect this - probably in dht_layout_anomalies() + /* TODO During DHT selfheal rewrite (almost) find a better place + * to detect this - probably in dht_layout_anomalies() */ if (layout->list[i].err > 0) { - gf_log (this->name, GF_LOG_DEBUG, - "path=%s err=%s on subvol=%s", - loc->path, strerror (layout->list[i].err), - (layout->list[i].xlator ? - layout->list[i].xlator->name : "<>")); - if (layout->list[i].err == ENOENT) - ret = 1; + gf_log_callingfn (this->name, GF_LOG_DEBUG, + "path=%s err=%s on subvol=%s", + loc->path, + strerror (layout->list[i].err), + (layout->list[i].xlator ? 
+ layout->list[i].xlator->name + : "<>")); + if ((layout->list[i].err == ENOENT) && (ret >= 0)) { + ret++; + } } } + out: return ret; } +int +dht_dir_has_layout (dict_t *xattr, char *name) +{ + + void *disk_layout_raw = NULL; + + return dict_get_ptr (xattr, name, &disk_layout_raw); +} int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, loc_t *loc, dict_t *xattr) { - int idx = 0; - int pos = -1; - int ret = 0; - int err = 0; - int dict_ret = 0; - int32_t disk_layout[4]; - void *disk_layout_raw = NULL; - int32_t count = -1; - uint32_t start_off = -1; - uint32_t stop_off = -1; + int idx = 0; + int pos = -1; + int ret = 0; + int err = 0; + int dict_ret = 0; + int32_t disk_layout[4]; + void *disk_layout_raw = NULL; + int32_t count = -1; + uint32_t start_off = -1; + uint32_t stop_off = -1; + dht_conf_t *conf = this->private; for (idx = 0; idx < layout->cnt; idx++) { @@ -646,7 +717,7 @@ dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, goto out; } - dict_ret = dict_get_ptr (xattr, "trusted.glusterfs.dht", + dict_ret = dict_get_ptr (xattr, conf->xattr_name, &disk_layout_raw); if (dict_ret < 0) { @@ -662,7 +733,7 @@ dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, count = ntoh32 (disk_layout[0]); if (count != 1) { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_ERROR, "%s - disk layout has invalid count %d", loc->path, count); ret = -1; @@ -711,7 +782,7 @@ dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode) LOCK (&conf->layout_lock); { - inode_ctx_put (inode, this, (uint64_t)(long)layout); + dht_inode_ctx_layout_set (inode, this, layout); } UNLOCK (&conf->layout_lock); diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c index 9dd487bc8..dbc9d0b3c 100644 --- a/xlators/cluster/dht/src/dht-linkfile.c +++ b/xlators/cluster/dht/src/dht-linkfile.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. 
<http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #ifndef _CONFIG_H @@ -28,127 +19,154 @@ #include "compat.h" #include "dht-common.h" - - int -dht_linkfile_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) +dht_linkfile_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) { - dht_local_t *local = NULL; + char is_linkfile = 0; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + call_frame_t *prev = NULL; local = frame->local; - local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno, - local->linkfile.inode, - &local->linkfile.stbuf, NULL, NULL); + prev = cookie; + conf = this->private; + + if (op_ret) + goto out; + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); + if (!is_linkfile) + gf_log (this->name, GF_LOG_WARNING, "got non-linkfile %s:%s", + prev->this->name, local->loc.path); +out: + local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno, + inode, stbuf, postparent, postparent, + xattr); return 0; } - +#define is_equal(a, b) (a == b) int dht_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; + xlator_t *subvol = NULL; call_frame_t *prev = NULL; - dict_t *xattr = NULL; - data_t *str_data = NULL; + dict_t *xattrs = NULL; + dht_conf_t *conf = NULL; int ret = -1; local = frame->local; - prev = cookie; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to create link file (%s)", - local->linkfile.loc.path, strerror (op_errno)); - goto err; - } + if (!op_ret) + local->linked = _gf_true; - xattr = get_new_dict (); - if (!xattr) { - op_errno = ENOMEM; - goto err; - } + FRAME_SU_UNDO (frame, dht_local_t); - local->linkfile.xattr = dict_ref (xattr); - local->linkfile.inode = 
inode_ref (inode); - - str_data = str_to_data (local->linkfile.srcvol->name); - if (!str_data) { - op_errno = ENOMEM; - goto err; - } - - ret = dict_set (xattr, "trusted.glusterfs.dht.linkto", str_data); - if (ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "%s: failed to initialize linkfile data", - local->linkfile.loc.path); - } - str_data = NULL; - - local->linkfile.stbuf = *stbuf; - - STACK_WIND (frame, dht_linkfile_xattr_cbk, - prev->this, prev->this->fops->setxattr, - &local->linkfile.loc, local->linkfile.xattr, 0); - - return 0; + if (op_ret && (op_errno == EEXIST)) { + conf = this->private; + prev = cookie; + subvol = prev->this; + if (!subvol) + goto out; + xattrs = dict_new (); + if (!xattrs) + goto out; + ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set linkto key"); + goto out; + } -err: - if (str_data) { - data_destroy (str_data); - str_data = NULL; + STACK_WIND (frame, dht_linkfile_lookup_cbk, subvol, + subvol->fops->lookup, &local->loc, xattrs); + if (xattrs) + dict_unref (xattrs); + return 0; } - +out: local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno, - inode, stbuf, preparent, postparent); + inode, stbuf, preparent, postparent, + xdata); + if (xattrs) + dict_unref (xattrs); return 0; } int dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, + xlator_t *this, xlator_t *tovol, xlator_t *fromvol, loc_t *loc) { dht_local_t *local = NULL; dict_t *dict = NULL; + int need_unref = 0; int ret = 0; + dht_conf_t *conf = this->private; local = frame->local; local->linkfile.linkfile_cbk = linkfile_cbk; local->linkfile.srcvol = tovol; - loc_copy (&local->linkfile.loc, loc); - if (!uuid_is_null (local->gfid)) { + local->linked = _gf_false; + + dict = local->params; + if (!dict) { dict = dict_new (); if (!dict) goto out; + need_unref = 1; + } + + if (!uuid_is_null (local->gfid)) { ret = dict_set_static_bin (dict, "gfid-req", local->gfid, 16); if 
(ret) gf_log ("dht-linkfile", GF_LOG_INFO, "%s: gfid set failed", loc->path); - } else if (local->params) { - dict = dict_ref (local->params); } - if (!dict) + + ret = dict_set_str (dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); + if (ret) + gf_log ("dht-linkfile", GF_LOG_INFO, + "%s: internal-fop set failed", loc->path); + + ret = dict_set_str (dict, conf->link_xattr_name, tovol->name); + + if (ret < 0) { gf_log (frame->this->name, GF_LOG_INFO, - "dict is NULL, need to make sure gfid's are same"); + "%s: failed to initialize linkfile data", + loc->path); + goto out; + } + local->link_subvol = fromvol; + /* Always create as root:root. dht_linkfile_attr_heal fixes the + * ownsership */ + FRAME_SU_DO (frame, dht_local_t); STACK_WIND (frame, dht_linkfile_create_cbk, fromvol, fromvol->fops->mknod, loc, - S_IFREG | DHT_LINKFILE_MODE, 0, dict); + S_IFREG | DHT_LINKFILE_MODE, 0, 0, dict); - if (dict) + if (need_unref && dict) dict_unref (dict); return 0; out: local->linkfile.linkfile_cbk (frame, NULL, frame->this, -1, ENOMEM, - loc->inode, NULL, NULL, NULL); + loc->inode, NULL, NULL, NULL, NULL); + + if (need_unref && dict) + dict_unref (dict); + return 0; } @@ -156,7 +174,8 @@ out: int dht_linkfile_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -190,16 +209,17 @@ dht_linkfile_unlink (call_frame_t *frame, xlator_t *this, goto err; } - unlink_local = dht_local_init (unlink_frame); + /* Using non-fop value here, as anyways, 'local->fop' is not used in + this particular case */ + unlink_local = dht_local_init (unlink_frame, loc, NULL, + GF_FOP_MAXVALUE); if (!unlink_local) { goto err; } - loc_copy (&unlink_local->loc, loc); - STACK_WIND (unlink_frame, dht_linkfile_unlink_cbk, subvol, subvol->fops->unlink, - &unlink_local->loc); + &unlink_local->loc, 0, 
NULL); return 0; err: @@ -224,7 +244,7 @@ dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct iatt *stbuf, if (!xattr) goto out; - ret = dict_get_ptr (xattr, "trusted.glusterfs.dht.linkto", &volname); + ret = dict_get_ptr (xattr, conf->link_xattr_name, &volname); if ((-1 == ret) || !volname) goto out; @@ -239,3 +259,70 @@ dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct iatt *stbuf, out: return subvol; } + +int +dht_linkfile_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) +{ + dht_local_t *local = NULL; + loc_t *loc = NULL; + + local = frame->local; + loc = &local->loc; + + if (op_ret) + gf_log (this->name, GF_LOG_ERROR, "setattr of uid/gid on %s" + " :<gfid:%s> failed (%s)", + (loc->path? loc->path: "NULL"), + uuid_utoa(local->gfid), strerror(op_errno)); + + DHT_STACK_DESTROY (frame); + + return 0; +} + +int +dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this) +{ + int ret = -1; + call_frame_t *copy = NULL; + dht_local_t *local = NULL; + dht_local_t *copy_local = NULL; + xlator_t *subvol = NULL; + struct iatt stbuf = {0,}; + + local = frame->local; + + GF_VALIDATE_OR_GOTO ("dht", local, out); + GF_VALIDATE_OR_GOTO ("dht", local->link_subvol, out); + + if (local->stbuf.ia_type == IA_INVAL) + return 0; + + uuid_copy (local->loc.gfid, local->stbuf.ia_gfid); + + copy = copy_frame (frame); + + if (!copy) + goto out; + + copy_local = dht_local_init (copy, &local->loc, NULL, 0); + + if (!copy_local) + goto out; + + stbuf = local->stbuf; + subvol = local->link_subvol; + + copy->local = copy_local; + + FRAME_SU_DO (copy, dht_local_t); + + STACK_WIND (copy, dht_linkfile_setattr_cbk, subvol, + subvol->fops->setattr, ©_local->loc, + &stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID), NULL); + ret = 0; +out: + return ret; +} diff --git a/xlators/cluster/dht/src/dht-mem-types.h b/xlators/cluster/dht/src/dht-mem-types.h index af31c8b07..e893eb48f 100644 
--- a/xlators/cluster/dht/src/dht-mem-types.h +++ b/xlators/cluster/dht/src/dht-mem-types.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -28,13 +19,17 @@ enum gf_dht_mem_types_ { gf_dht_mt_dht_conf_t, gf_dht_mt_char, gf_dht_mt_int32_t, - gf_dht_mt_dht_local_t, gf_dht_mt_xlator_t, gf_dht_mt_dht_layout_t, gf_switch_mt_dht_conf_t, gf_switch_mt_dht_du_t, gf_switch_mt_switch_sched_array, gf_switch_mt_switch_struct, + gf_dht_mt_subvol_time, + gf_dht_mt_loc_t, + gf_defrag_info_mt, + gf_dht_mt_inode_ctx_t, + gf_dht_mt_ctx_stat_time_t, gf_dht_mt_end }; #endif diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c new file mode 100644 index 000000000..bcb19f23e --- /dev/null +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -0,0 +1,1815 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "dht-common.h" +#include "xlator.h" +#include <fnmatch.h> + +#define GF_DISK_SECTOR_SIZE 512 +#define DHT_REBALANCE_PID 4242 /* Change it if required */ +#define DHT_REBALANCE_BLKSIZE (128 * 1024) + +static int +dht_write_with_holes (xlator_t *to, fd_t *fd, struct iovec *vec, int count, + int32_t size, off_t offset, struct iobref *iobref) +{ + int i = 0; + int ret = -1; + int start_idx = 0; + int tmp_offset = 0; + int write_needed = 0; + int buf_len = 0; + int size_pending = 0; + char *buf = NULL; + + /* loop through each vector */ + for (i = 0; i < count; i++) { + buf = vec[i].iov_base; + buf_len = vec[i].iov_len; + + for (start_idx = 0; (start_idx + GF_DISK_SECTOR_SIZE) <= buf_len; + start_idx += GF_DISK_SECTOR_SIZE) { + + if (mem_0filled (buf + start_idx, GF_DISK_SECTOR_SIZE) != 0) { + write_needed = 1; + continue; + } + + if (write_needed) { + ret = syncop_write (to, fd, (buf + tmp_offset), + (start_idx - tmp_offset), + (offset + tmp_offset), + iobref, 0); + /* 'path' will be logged in calling function */ + if (ret < 0) { + gf_log (THIS->name, GF_LOG_WARNING, + "failed to write (%s)", + strerror (errno)); + goto out; + } + + write_needed = 0; + } + tmp_offset = start_idx + GF_DISK_SECTOR_SIZE; + } + + if ((start_idx < buf_len) || write_needed) { + /* This means, last chunk is not yet written.. 
write it */ + ret = syncop_write (to, fd, (buf + tmp_offset), + (buf_len - tmp_offset), + (offset + tmp_offset), iobref, 0); + if (ret < 0) { + /* 'path' will be logged in calling function */ + gf_log (THIS->name, GF_LOG_WARNING, + "failed to write (%s)", + strerror (errno)); + goto out; + } + } + + size_pending = (size - buf_len); + if (!size_pending) + break; + } + + ret = size; +out: + return ret; + +} + +int32_t +gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs, + struct iatt *stbuf) +{ + int32_t ret = -1; + xlator_t *cached_subvol = NULL; + xlator_t *hashed_subvol = NULL; + xlator_t *linkto_subvol = NULL; + data_t *data = NULL; + struct iatt iatt = {0,}; + int32_t op_errno = 0; + dht_conf_t *conf = NULL; + + GF_VALIDATE_OR_GOTO ("defrag", loc, out); + GF_VALIDATE_OR_GOTO ("defrag", loc->name, out); + GF_VALIDATE_OR_GOTO ("defrag", stbuf, out); + GF_VALIDATE_OR_GOTO ("defrag", this, out); + GF_VALIDATE_OR_GOTO ("defrag", xattrs, out); + GF_VALIDATE_OR_GOTO ("defrag", this->private, out); + + conf = this->private; + + if (uuid_is_null (loc->pargfid)) { + gf_log ("", GF_LOG_ERROR, "loc->pargfid is NULL for " + "%s", loc->path); + goto out; + } + + if (uuid_is_null (loc->gfid)) { + gf_log ("", GF_LOG_ERROR, "loc->gfid is NULL for " + "%s", loc->path); + goto out; + } + + cached_subvol = dht_subvol_get_cached (this, loc->inode); + if (!cached_subvol) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get cached subvol" + " for %s on %s", loc->name, this->name); + goto out; + } + + hashed_subvol = dht_subvol_get_hashed (this, loc); + if (!hashed_subvol) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get hashed subvol" + " for %s on %s", loc->name, this->name); + goto out; + } + + gf_log (this->name, GF_LOG_INFO, "Attempting to migrate hardlink %s " + "with gfid %s from %s -> %s", loc->name, uuid_utoa (loc->gfid), + cached_subvol->name, hashed_subvol->name); + data = dict_get (xattrs, conf->link_xattr_name); + /* set linkto on cached -> hashed if 
not present, else link it */ + if (!data) { + ret = dict_set_str (xattrs, conf->link_xattr_name, + hashed_subvol->name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set " + "linkto xattr in dict for %s", loc->name); + goto out; + } + + ret = syncop_setxattr (cached_subvol, loc, xattrs, 0); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Linkto setxattr " + "failed %s -> %s (%s)", cached_subvol->name, + loc->name, strerror (errno)); + goto out; + } + goto out; + } else { + linkto_subvol = dht_linkfile_subvol (this, NULL, NULL, xattrs); + if (!linkto_subvol) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get " + "linkto subvol for %s", loc->name); + } else { + hashed_subvol = linkto_subvol; + } + + ret = syncop_link (hashed_subvol, loc, loc); + if (ret) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "link of %s -> %s" + " failed on subvol %s (%s)", loc->name, + uuid_utoa(loc->gfid), + hashed_subvol->name, strerror (op_errno)); + if (op_errno != EEXIST) + goto out; + } + } + ret = syncop_lookup (hashed_subvol, loc, NULL, &iatt, NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed lookup %s on %s (%s)" + , loc->name, hashed_subvol->name, strerror (errno)); + goto out; + } + + if (iatt.ia_nlink == stbuf->ia_nlink) { + ret = dht_migrate_file (this, loc, cached_subvol, hashed_subvol, + GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS); + if (ret) + goto out; + } + ret = 0; +out: + return ret; +} + + +static inline int +__is_file_migratable (xlator_t *this, loc_t *loc, + struct iatt *stbuf, dict_t *xattrs, int flags) +{ + int ret = -1; + + if (IA_ISDIR (stbuf->ia_type)) { + gf_log (this->name, GF_LOG_WARNING, + "%s: migrate-file called on directory", loc->path); + ret = -1; + goto out; + } + + if (flags == GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS) { + ret = 0; + goto out; + } + if (stbuf->ia_nlink > 1) { + /* support for decomission */ + if (flags == GF_DHT_MIGRATE_HARDLINK) { + ret = gf_defrag_handle_hardlink (this, loc, + xattrs, stbuf); + 
if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to migrate file with link", + loc->path); + } + } else { + gf_log (this->name, GF_LOG_WARNING, + "%s: file has hardlinks", loc->path); + } + ret = ENOTSUP; + goto out; + } + + ret = 0; + +out: + return ret; +} + +static inline int +__dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struct iatt *stbuf, + dict_t *dict, fd_t **dst_fd, dict_t *xattr) +{ + xlator_t *this = NULL; + int ret = -1; + fd_t *fd = NULL; + struct iatt new_stbuf = {0,}; + dht_conf_t *conf = NULL; + + this = THIS; + conf = this->private; + + ret = dict_set_static_bin (dict, "gfid-req", stbuf->ia_gfid, 16); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set gfid in dict for create", loc->path); + goto out; + } + + ret = dict_set_str (dict, conf->link_xattr_name, from->name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set gfid in dict for create", loc->path); + goto out; + } + + fd = fd_create (loc->inode, DHT_REBALANCE_PID); + if (!fd) { + gf_log (this->name, GF_LOG_ERROR, + "%s: fd create failed (destination) (%s)", + loc->path, strerror (errno)); + ret = -1; + goto out; + } + + ret = syncop_lookup (to, loc, NULL, &new_stbuf, NULL, NULL); + if (!ret) { + /* File exits in the destination, check if gfid matches */ + if (uuid_compare (stbuf->ia_gfid, new_stbuf.ia_gfid) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "file %s exits in %s with different gfid", + loc->path, to->name); + fd_unref (fd); + goto out; + } + } + if ((ret == -1) && (errno != ENOENT)) { + /* File exists in destination, but not accessible */ + gf_log (THIS->name, GF_LOG_WARNING, + "%s: failed to lookup file (%s)", + loc->path, strerror (errno)); + goto out; + } + + /* Create the destination with LINKFILE mode, and linkto xattr, + if the linkfile already exists, it will just open the file */ + ret = syncop_create (to, loc, O_RDWR, DHT_LINKFILE_MODE, fd, + dict, &new_stbuf); + if (ret < 0) { + gf_log 
(this->name, GF_LOG_ERROR, + "failed to create %s on %s (%s)", + loc->path, to->name, strerror (errno)); + goto out; + } + + ret = syncop_fsetxattr (to, fd, xattr, 0); + if (ret == -1) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set xattr on %s (%s)", + loc->path, to->name, strerror (errno)); + + ret = syncop_ftruncate (to, fd, stbuf->ia_size); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, + "ftruncate failed for %s on %s (%s)", + loc->path, to->name, strerror (errno)); + + ret = syncop_fsetattr (to, fd, stbuf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID), + NULL, NULL); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, + "chown failed for %s on %s (%s)", + loc->path, to->name, strerror (errno)); + + if (dst_fd) + *dst_fd = fd; + + /* success */ + ret = 0; + +out: + return ret; +} + +static inline int +__dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc, + struct iatt *stbuf, int flag) +{ + struct statvfs src_statfs = {0,}; + struct statvfs dst_statfs = {0,}; + int ret = -1; + xlator_t *this = NULL; + + uint64_t src_statfs_blocks = 1; + uint64_t dst_statfs_blocks = 1; + + this = THIS; + + ret = syncop_statfs (from, loc, &src_statfs); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "failed to get statfs of %s on %s (%s)", + loc->path, from->name, strerror (errno)); + goto out; + } + + ret = syncop_statfs (to, loc, &dst_statfs); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "failed to get statfs of %s on %s (%s)", + loc->path, to->name, strerror (errno)); + goto out; + } + + /* if force option is given, do not check for space @ dst. + * Check only if space is avail for the file */ + if (flag != GF_DHT_MIGRATE_DATA) + goto check_avail_space; + + /* Check: + During rebalance `migrate-data` - Destination subvol experiences + a `reduction` in 'blocks' of free space, at the same time source + subvol gains certain 'blocks' of free space. 
A valid check is + necessary here to avoid errorneous move to destination where + the space could be scantily available. + */ + if (stbuf) { + dst_statfs_blocks = ((dst_statfs.f_bavail * + dst_statfs.f_bsize) / + GF_DISK_SECTOR_SIZE); + src_statfs_blocks = ((src_statfs.f_bavail * + src_statfs.f_bsize) / + GF_DISK_SECTOR_SIZE); + if ((dst_statfs_blocks - stbuf->ia_blocks) < + (src_statfs_blocks + stbuf->ia_blocks)) { + gf_log (this->name, GF_LOG_WARNING, + "data movement attempted from node (%s) with" + " higher disk space to a node (%s) with " + "lesser disk space (%s)", from->name, + to->name, loc->path); + + /* this is not a 'failure', but we don't want to + consider this as 'success' too :-/ */ + ret = 1; + goto out; + } + } +check_avail_space: + if (((dst_statfs.f_bavail * dst_statfs.f_bsize) / + GF_DISK_SECTOR_SIZE) < stbuf->ia_blocks) { + gf_log (this->name, GF_LOG_ERROR, + "data movement attempted from node (%s) with " + "to node (%s) which does not have required free space" + " for %s", from->name, to->name, loc->path); + ret = 1; + goto out; + } + + ret = 0; +out: + return ret; +} + +static inline int +__dht_rebalance_migrate_data (xlator_t *from, xlator_t *to, fd_t *src, fd_t *dst, + uint64_t ia_size, int hole_exists) +{ + int ret = 0; + int count = 0; + off_t offset = 0; + struct iovec *vector = NULL; + struct iobref *iobref = NULL; + uint64_t total = 0; + size_t read_size = 0; + + /* if file size is '0', no need to enter this loop */ + while (total < ia_size) { + read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE) ? 
+ DHT_REBALANCE_BLKSIZE : (ia_size - total)); + ret = syncop_readv (from, src, read_size, + offset, 0, &vector, &count, &iobref); + if (!ret || (ret < 0)) { + break; + } + + if (hole_exists) + ret = dht_write_with_holes (to, dst, vector, count, + ret, offset, iobref); + else + ret = syncop_writev (to, dst, vector, count, + offset, iobref, 0); + if (ret < 0) { + break; + } + offset += ret; + total += ret; + + GF_FREE (vector); + if (iobref) + iobref_unref (iobref); + iobref = NULL; + vector = NULL; + } + if (iobref) + iobref_unref (iobref); + GF_FREE (vector); + + if (ret >= 0) + ret = 0; + + return ret; +} + + +static inline int +__dht_rebalance_open_src_file (xlator_t *from, xlator_t *to, loc_t *loc, + struct iatt *stbuf, fd_t **src_fd) +{ + int ret = 0; + fd_t *fd = NULL; + dict_t *dict = NULL; + xlator_t *this = NULL; + struct iatt iatt = {0,}; + dht_conf_t *conf = NULL; + + this = THIS; + conf = this->private; + + fd = fd_create (loc->inode, DHT_REBALANCE_PID); + if (!fd) { + gf_log (this->name, GF_LOG_ERROR, + "%s: fd create failed (source)", loc->path); + ret = -1; + goto out; + } + + ret = syncop_open (from, loc, O_RDWR, fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "failed to open file %s on %s (%s)", + loc->path, from->name, strerror (errno)); + goto out; + } + + ret = -1; + dict = dict_new (); + if (!dict) + goto out; + + ret = dict_set_str (dict, conf->link_xattr_name, to->name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set xattr in dict for %s (linkto:%s)", + loc->path, to->name); + goto out; + } + + /* Once the migration starts, the source should have 'linkto' key set + to show which is the target, so other clients can work around it */ + ret = syncop_setxattr (from, loc, dict, 0); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set xattr on %s in %s (%s)", + loc->path, from->name, strerror (errno)); + goto out; + } + + /* mode should be (+S+T) to indicate migration is in progress */ + iatt.ia_prot = 
stbuf->ia_prot; + iatt.ia_type = stbuf->ia_type; + iatt.ia_prot.sticky = 1; + iatt.ia_prot.sgid = 1; + + ret = syncop_setattr (from, loc, &iatt, GF_SET_ATTR_MODE, NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set mode on %s in %s (%s)", + loc->path, from->name, strerror (errno)); + goto out; + } + + if (src_fd) + *src_fd = fd; + + /* success */ + ret = 0; +out: + if (dict) + dict_unref (dict); + + return ret; +} + +int +migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc, + struct iatt *buf) +{ + int ret = -1; + dict_t *rsp_dict = NULL; + dict_t *dict = NULL; + char *link = NULL; + struct iatt stbuf = {0,}; + dht_conf_t *conf = this->private; + + dict = dict_new (); + if (!dict) + goto out; + + ret = dict_set_int32 (dict, conf->link_xattr_name, 256); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set 'linkto' key in dict", loc->path); + goto out; + } + + /* check in the destination if the file is link file */ + ret = syncop_lookup (to, loc, dict, &stbuf, &rsp_dict, NULL); + if ((ret == -1) && (errno != ENOENT)) { + gf_log (this->name, GF_LOG_WARNING, "%s: lookup failed (%s)", + loc->path, strerror (errno)); + goto out; + } + + /* we no more require this key */ + dict_del (dict, conf->link_xattr_name); + + /* file exists in target node, only if it is 'linkfile' its valid, + otherwise, error out */ + if (!ret) { + if (!check_is_linkfile (loc->inode, &stbuf, rsp_dict, + conf->link_xattr_name)) { + gf_log (this->name, GF_LOG_WARNING, + "%s: file exists in destination", loc->path); + ret = -1; + goto out; + } + + /* as file is linkfile, delete it */ + ret = syncop_unlink (to, loc); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to delete the linkfile (%s)", + loc->path, strerror (errno)); + goto out; + } + } + + /* Set the gfid of the source file in dict */ + ret = dict_set_static_bin (dict, "gfid-req", buf->ia_gfid, 16); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: 
failed to set gfid in dict for create", loc->path); + goto out; + } + + /* Create the file in target */ + if (IA_ISLNK (buf->ia_type)) { + /* Handle symlinks separately */ + ret = syncop_readlink (from, loc, &link, buf->ia_size); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: readlink on symlink failed (%s)", + loc->path, strerror (errno)); + goto out; + } + + ret = syncop_symlink (to, loc, link, dict, 0); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: creating symlink failed (%s)", + loc->path, strerror (errno)); + goto out; + } + + goto done; + } + + ret = syncop_mknod (to, loc, st_mode_from_ia (buf->ia_prot, + buf->ia_type), + makedev (ia_major (buf->ia_rdev), + ia_minor (buf->ia_rdev)), dict, 0); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "%s: mknod failed (%s)", + loc->path, strerror (errno)); + goto out; + } + +done: + ret = syncop_setattr (to, loc, buf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID | + GF_SET_ATTR_MODE), NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform setattr on %s (%s)", + loc->path, to->name, strerror (errno)); + } + + ret = syncop_unlink (from, loc); + if (ret) + gf_log (this->name, GF_LOG_WARNING, "%s: unlink failed (%s)", + loc->path, strerror (errno)); + +out: + if (dict) + dict_unref (dict); + + if (rsp_dict) + dict_unref (rsp_dict); + + return ret; +} + +/* + return values: + + -1 : failure + 0 : successfully migrated data + 1 : not a failure, but we can't migrate data as of now +*/ +int +dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, + int flag) +{ + int ret = -1; + struct iatt new_stbuf = {0,}; + struct iatt stbuf = {0,}; + struct iatt empty_iatt = {0,}; + ia_prot_t src_ia_prot = {0,}; + fd_t *src_fd = NULL; + fd_t *dst_fd = NULL; + dict_t *dict = NULL; + dict_t *xattr = NULL; + dict_t *xattr_rsp = NULL; + int file_has_holes = 0; + dht_conf_t *conf = this->private; + + gf_log (this->name, GF_LOG_INFO, "%s: attempting to move from %s to 
%s", + loc->path, from->name, to->name); + + dict = dict_new (); + if (!dict) + goto out; + + ret = dict_set_int32 (dict, conf->link_xattr_name, 256); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set 'linkto' key in dict", loc->path); + goto out; + } + + /* Phase 1 - Data migration is in progress from now on */ + ret = syncop_lookup (from, loc, dict, &stbuf, &xattr_rsp, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s: lookup failed on %s (%s)", + loc->path, from->name, strerror (errno)); + goto out; + } + + /* we no more require this key */ + dict_del (dict, conf->link_xattr_name); + + /* preserve source mode, so set the same to the destination */ + src_ia_prot = stbuf.ia_prot; + + /* Check if file can be migrated */ + ret = __is_file_migratable (this, loc, &stbuf, xattr_rsp, flag); + if (ret) + goto out; + + /* Take care of the special files */ + if (!IA_ISREG (stbuf.ia_type)) { + /* Special files */ + ret = migrate_special_files (this, from, to, loc, &stbuf); + goto out; + } + + /* TODO: move all xattr related operations to fd based operations */ + ret = syncop_listxattr (from, loc, &xattr); + if (ret == -1) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to get xattr from %s (%s)", + loc->path, from->name, strerror (errno)); + + /* create the destination, with required modes/xattr */ + ret = __dht_rebalance_create_dst_file (to, from, loc, &stbuf, + dict, &dst_fd, xattr); + if (ret) + goto out; + + ret = __dht_check_free_space (to, from, loc, &stbuf, flag); + if (ret) { + goto out; + } + + /* Open the source, and also update mode/xattr */ + ret = __dht_rebalance_open_src_file (from, to, loc, &stbuf, &src_fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to open %s on %s", + loc->path, from->name); + goto out; + } + + + ret = syncop_fstat (from, src_fd, &stbuf); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to lookup %s on %s (%s)", + loc->path, from->name, strerror (errno)); + goto out; + } + + /* Try to 
preserve 'holes' while migrating data */ + if (stbuf.ia_size > (stbuf.ia_blocks * GF_DISK_SECTOR_SIZE)) + file_has_holes = 1; + + /* All I/O happens in this function */ + ret = __dht_rebalance_migrate_data (from, to, src_fd, dst_fd, + stbuf.ia_size, file_has_holes); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s: failed to migrate data", + loc->path); + /* reset the destination back to 0 */ + ret = syncop_ftruncate (to, dst_fd, 0); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to reset target size back to 0 (%s)", + loc->path, strerror (errno)); + } + + ret = -1; + goto out; + } + + /* TODO: Sync the locks */ + + ret = syncop_fsync (to, dst_fd, 0); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to fsync on %s (%s)", + loc->path, to->name, strerror (errno)); + + + /* Phase 2 - Data-Migration Complete, Housekeeping updates pending */ + + ret = syncop_fstat (from, src_fd, &new_stbuf); + if (ret < 0) { + /* Failed to get the stat info */ + gf_log (this->name, GF_LOG_ERROR, + "failed to fstat file %s on %s (%s)", + loc->path, from->name, strerror (errno)); + goto out; + } + + /* source would have both sticky bit and sgid bit set, reset it to 0, + and set the source permission on destination, if it was not set + prior to setting rebalance-modes in source */ + if (!src_ia_prot.sticky) + new_stbuf.ia_prot.sticky = 0; + + if (!src_ia_prot.sgid) + new_stbuf.ia_prot.sgid = 0; + + /* TODO: if the source actually had sticky bit, or sgid bit set, + we are not handling it */ + + ret = syncop_fsetattr (to, dst_fd, &new_stbuf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID | + GF_SET_ATTR_MODE), NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform setattr on %s (%s)", + loc->path, to->name, strerror (errno)); + goto out; + } + + /* Because 'futimes' is not portable */ + ret = syncop_setattr (to, loc, &new_stbuf, + (GF_SET_ATTR_MTIME | GF_SET_ATTR_ATIME), + NULL, NULL); + if (ret) { + gf_log (this->name, 
GF_LOG_WARNING, + "%s: failed to perform setattr on %s (%s)", + loc->path, to->name, strerror (errno)); + } + + /* Make the source as a linkfile first before deleting it */ + empty_iatt.ia_prot.sticky = 1; + ret = syncop_fsetattr (from, src_fd, &empty_iatt, + GF_SET_ATTR_MODE, NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, \ + "%s: failed to perform setattr on %s (%s)", + loc->path, from->name, strerror (errno)); + goto out; + } + + /* Free up the data blocks on the source node, as the whole + file is migrated */ + ret = syncop_ftruncate (from, src_fd, 0); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform truncate on %s (%s)", + loc->path, from->name, strerror (errno)); + } + + /* remove the 'linkto' xattr from the destination */ + ret = syncop_fremovexattr (to, dst_fd, conf->link_xattr_name); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform removexattr on %s (%s)", + loc->path, to->name, strerror (errno)); + } + + /* Do a stat and check the gfid before unlink */ + ret = syncop_stat (from, loc, &empty_iatt); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to do a stat on %s (%s)", + loc->path, from->name, strerror (errno)); + goto out; + } + + if (uuid_compare (empty_iatt.ia_gfid, loc->gfid) == 0) { + /* take out the source from namespace */ + ret = syncop_unlink (from, loc); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform unlink on %s (%s)", + loc->path, from->name, strerror (errno)); + goto out; + } + } + + ret = syncop_lookup (this, loc, NULL, NULL, NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "%s: failed to lookup the file on subvolumes (%s)", + loc->path, strerror (errno)); + } + + gf_log (this->name, GF_LOG_INFO, + "completed migration of %s from subvolume %s to %s", + loc->path, from->name, to->name); + + ret = 0; +out: + if (dict) + dict_unref (dict); + + if (xattr) + dict_unref (xattr); + if (xattr_rsp) + dict_unref 
(xattr_rsp); + + if (dst_fd) + syncop_close (dst_fd); + if (src_fd) + syncop_close (src_fd); + + return ret; +} + +static int +rebalance_task (void *data) +{ + int ret = -1; + dht_local_t *local = NULL; + call_frame_t *frame = NULL; + + frame = data; + + local = frame->local; + + /* This function is 'synchrounous', hence if it returns, + we are done with the task */ + ret = dht_migrate_file (THIS, &local->loc, local->rebalance.from_subvol, + local->rebalance.target_node, local->flags); + + return ret; +} + +static int +rebalance_task_completion (int op_ret, call_frame_t *sync_frame, void *data) +{ + int ret = -1; + uint64_t layout_int = 0; + dht_layout_t *layout = 0; + xlator_t *this = NULL; + dht_local_t *local = NULL; + int32_t op_errno = EINVAL; + + this = THIS; + local = sync_frame->local; + + if (!op_ret) { + /* Make sure we have valid 'layout' in inode ctx + after the operation */ + ret = inode_ctx_del (local->loc.inode, this, &layout_int); + if (!ret && layout_int) { + layout = (dht_layout_t *)(long)layout_int; + dht_layout_unref (this, layout); + } + + ret = dht_layout_preset (this, local->rebalance.target_node, + local->loc.inode); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set inode ctx", local->loc.path); + } + + if (op_ret == -1) { + /* Failure of migration process, mostly due to write process. + as we can't preserve the exact errno, lets say there was + no space to migrate-data + */ + op_errno = ENOSPC; + } + + if (op_ret == 1) { + /* migration didn't happen, but is not a failure, let the user + understand that he doesn't have permission to migrate the + file. 
+ */ + op_ret = -1; + op_errno = EPERM; + } + + DHT_STACK_UNWIND (setxattr, sync_frame, op_ret, op_errno, NULL); + return 0; +} + +int +dht_start_rebalance_task (xlator_t *this, call_frame_t *frame) +{ + int ret = -1; + + ret = synctask_new (this->ctx->env, rebalance_task, + rebalance_task_completion, + frame, frame); + return ret; +} + +int +gf_listener_stop (xlator_t *this) +{ + glusterfs_ctx_t *ctx = NULL; + cmd_args_t *cmd_args = NULL; + int ret = 0; + + ctx = this->ctx; + GF_ASSERT (ctx); + cmd_args = &ctx->cmd_args; + if (cmd_args->sock_file) { + ret = unlink (cmd_args->sock_file); + if (ret && (ENOENT == errno)) { + ret = 0; + } + } + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to unlink listener " + "socket %s, error: %s", cmd_args->sock_file, + strerror (errno)); + } + return ret; +} + +void +dht_build_root_inode (xlator_t *this, inode_t **inode) +{ + inode_table_t *itable = NULL; + uuid_t root_gfid = {0, }; + + itable = inode_table_new (0, this); + if (!itable) + return; + + root_gfid[15] = 1; + *inode = inode_find (itable, root_gfid); +} + +void +dht_build_root_loc (inode_t *inode, loc_t *loc) +{ + loc->path = "/"; + loc->inode = inode; + loc->inode->ia_type = IA_IFDIR; + memset (loc->gfid, 0, 16); + loc->gfid[15] = 1; +} + + +/* return values: 1 -> error, bug ignore and continue + 0 -> proceed + -1 -> error, handle it */ +int32_t +gf_defrag_handle_migrate_error (int32_t op_errno, gf_defrag_info_t *defrag) +{ + /* if errno is not ENOSPC or ENOTCONN, we can still continue + with rebalance process */ + if ((errno != ENOSPC) || (errno != ENOTCONN)) + return 1; + + if (errno == ENOTCONN) { + /* Most probably mount point went missing (mostly due + to a brick down), say rebalance failure to user, + let him restart it if everything is fine */ + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + return -1; + } + + if (errno == ENOSPC) { + /* rebalance process itself failed, may be + remote brick went down, or write failed due to + disk full etc 
etc.. */ + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + return -1; + } + + return 0; +} + +static gf_boolean_t +gf_defrag_pattern_match (gf_defrag_info_t *defrag, char *name, uint64_t size) +{ + gf_defrag_pattern_list_t *trav = NULL; + gf_boolean_t match = _gf_false; + gf_boolean_t ret = _gf_false; + + GF_VALIDATE_OR_GOTO ("dht", defrag, out); + + trav = defrag->defrag_pattern; + while (trav) { + if (!fnmatch (trav->path_pattern, name, FNM_NOESCAPE)) { + match = _gf_true; + break; + } + trav = trav->next; + } + + if ((match == _gf_true) && (size >= trav->size)) + ret = _gf_true; + + out: + return ret; +} + +/* We do a depth first traversal of directories. But before we move into + * subdirs, we complete the data migration of those directories whose layouts + * have been fixed + */ + +int +gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, + dict_t *migrate_data) +{ + int ret = -1; + loc_t entry_loc = {0,}; + fd_t *fd = NULL; + gf_dirent_t entries; + gf_dirent_t *tmp = NULL; + gf_dirent_t *entry = NULL; + gf_boolean_t free_entries = _gf_false; + off_t offset = 0; + dict_t *dict = NULL; + struct iatt iatt = {0,}; + int32_t op_errno = 0; + char *uuid_str = NULL; + uuid_t node_uuid = {0,}; + int readdir_operrno = 0; + struct timeval dir_start = {0,}; + struct timeval end = {0,}; + double elapsed = {0,}; + struct timeval start = {0,}; + int32_t err = 0; + + gf_log (this->name, GF_LOG_INFO, "migrate data called on %s", + loc->path); + gettimeofday (&dir_start, NULL); + + fd = fd_create (loc->inode, defrag->pid); + if (!fd) { + gf_log (this->name, GF_LOG_ERROR, "Failed to create fd"); + goto out; + } + + ret = syncop_opendir (this, loc, fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to open dir %s", + loc->path); + goto out; + } + + INIT_LIST_HEAD (&entries.list); + + while ((ret = syncop_readdirp (this, fd, 131072, offset, NULL, + &entries)) != 0) { + + if (ret < 0) { + + gf_log (this->name, GF_LOG_ERROR, "Readdir returned 
%s." + " Aborting migrate-data", + strerror(readdir_operrno)); + goto out; + } + + /* Need to keep track of ENOENT errno, that means, there is no + need to send more readdirp() */ + readdir_operrno = errno; + + if (list_empty (&entries.list)) + break; + + free_entries = _gf_true; + + list_for_each_entry_safe (entry, tmp, &entries.list, list) { + if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { + ret = 1; + goto out; + } + + offset = entry->d_off; + + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; + + if (IA_ISDIR (entry->d_stat.ia_type)) + continue; + + defrag->num_files_lookedup++; + if (defrag->stats == _gf_true) { + gettimeofday (&start, NULL); + } + if (defrag->defrag_pattern && + (gf_defrag_pattern_match (defrag, entry->d_name, + entry->d_stat.ia_size) + == _gf_false)) { + continue; + } + loc_wipe (&entry_loc); + ret =dht_build_child_loc (this, &entry_loc, loc, + entry->d_name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Child loc" + " build failed"); + goto out; + } + + if (uuid_is_null (entry->d_stat.ia_gfid)) { + gf_log (this->name, GF_LOG_ERROR, "%s/%s" + " gfid not present", loc->path, + entry->d_name); + continue; + } + + uuid_copy (entry_loc.gfid, entry->d_stat.ia_gfid); + + if (uuid_is_null (loc->gfid)) { + gf_log (this->name, GF_LOG_ERROR, "%s/%s" + " gfid not present", loc->path, + entry->d_name); + continue; + } + + uuid_copy (entry_loc.pargfid, loc->gfid); + + entry_loc.inode->ia_type = entry->d_stat.ia_type; + + ret = syncop_lookup (this, &entry_loc, NULL, &iatt, + NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s" + " lookup failed", entry_loc.path); + continue; + } + + ret = syncop_getxattr (this, &entry_loc, &dict, + GF_XATTR_NODE_UUID_KEY); + if(ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Failed to " + "get node-uuid for %s", entry_loc.path); + continue; + } + + ret = dict_get_str (dict, GF_XATTR_NODE_UUID_KEY, + &uuid_str); + if(ret < 0) { + gf_log (this->name, GF_LOG_ERROR, 
"Failed to " + "get node-uuid from dict for %s", + entry_loc.path); + continue; + } + + if (uuid_parse (uuid_str, node_uuid)) { + gf_log (this->name, GF_LOG_ERROR, "uuid_parse " + "failed for %s", entry_loc.path); + continue; + } + + /* if file belongs to different node, skip migration + * the other node will take responsibility of migration + */ + if (uuid_compare (node_uuid, defrag->node_uuid)) { + gf_log (this->name, GF_LOG_TRACE, "%s does not" + "belong to this node", entry_loc.path); + continue; + } + + uuid_str = NULL; + + dict_del (dict, GF_XATTR_NODE_UUID_KEY); + + + /* if distribute is present, it will honor this key. + * -1 is returned if distribute is not present or file + * doesn't have a link-file. If file has link-file, the + * path of link-file will be the value, and also that + * guarantees that file has to be mostly migrated */ + + ret = syncop_getxattr (this, &entry_loc, &dict, + GF_XATTR_LINKINFO_KEY); + if (ret < 0) { + gf_log (this->name, GF_LOG_TRACE, "failed to " + "get link-to key for %s", + entry_loc.path); + continue; + } + + ret = syncop_setxattr (this, &entry_loc, migrate_data, + 0); + if (ret) { + err = op_errno; + /* errno is overloaded. 
See + * rebalance_task_completion () */ + if (err != ENOSPC) { + gf_log (this->name, GF_LOG_DEBUG, + "migrate-data skipped for %s" + " due to space constraints", + entry_loc.path); + defrag->skipped +=1; + } else{ + gf_log (this->name, GF_LOG_ERROR, + "migrate-data failed for %s", + entry_loc.path); + defrag->total_failures +=1; + } + } + + if (ret == -1) { + op_errno = errno; + ret = gf_defrag_handle_migrate_error (op_errno, + defrag); + + if (!ret) + gf_log (this->name, GF_LOG_DEBUG, + "migrate-data on %s failed: %s", + entry_loc.path, + strerror (op_errno)); + else if (ret == 1) + continue; + else if (ret == -1) + goto out; + } + + LOCK (&defrag->lock); + { + defrag->total_files += 1; + defrag->total_data += iatt.ia_size; + } + UNLOCK (&defrag->lock); + if (defrag->stats == _gf_true) { + gettimeofday (&end, NULL); + elapsed = (end.tv_sec - start.tv_sec) * 1e6 + + (end.tv_usec - start.tv_usec); + gf_log (this->name, GF_LOG_INFO, "Migration of " + "file:%s size:%"PRIu64" bytes took %.2f" + "secs", entry_loc.path, iatt.ia_size, + elapsed/1e6); + } + } + + gf_dirent_free (&entries); + free_entries = _gf_false; + INIT_LIST_HEAD (&entries.list); + + if (readdir_operrno == ENOENT) + break; + } + + gettimeofday (&end, NULL); + elapsed = (end.tv_sec - dir_start.tv_sec) * 1e6 + + (end.tv_usec - dir_start.tv_usec); + gf_log (this->name, GF_LOG_INFO, "Migration operation on dir %s took " + "%.2f secs", loc->path, elapsed/1e6); + ret = 0; +out: + if (free_entries) + gf_dirent_free (&entries); + + loc_wipe (&entry_loc); + + if (dict) + dict_unref(dict); + + if (fd) + fd_unref (fd); + return ret; + +} + + +
/*
 * gf_defrag_fix_layout - fix the layout of every sub-directory of a directory,
 * recursing depth-first.
 *
 * @this:         xlator the sync operations are issued on.
 * @defrag:       rebalance state; its cmd, pid and defrag_status are read and
 *                its failure counter / status are updated here.
 * @loc:          directory to process.
 * @fix_layout:   dict sent via setxattr to each sub-directory.
 * @migrate_data: dict handed to gf_defrag_migrate_data() unless the command is
 *                GF_DEFRAG_CMD_START_LAYOUT_FIX.
 *
 * The directory is first looked up and (unless only a layout fix was asked
 * for) its data is migrated. It is then opened and read with readdirp; for
 * every entry that is a directory other than "." and "..", the entry is
 * looked up, fix_layout is set on it and this function is called on it.
 *
 * Returns 0 on success, 1 when defrag_status is no longer
 * GF_DEFRAG_STATUS_STARTED while entries are being walked, and otherwise the
 * non-zero result of the step that failed.
 */
int
gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
                      dict_t *fix_layout, dict_t *migrate_data)
{
        int              ret            = -1;
        loc_t            entry_loc      = {0,};
        fd_t            *fd             = NULL;
        gf_dirent_t      entries;
        gf_dirent_t     *tmp            = NULL;
        gf_dirent_t     *entry          = NULL;
        gf_boolean_t     free_entries   = _gf_false;
        off_t            offset         = 0;
        struct iatt      iatt           = {0,};
        int              readdirp_errno = 0;

        ret = syncop_lookup (this, loc, NULL, &iatt, NULL, NULL);
        if (ret) {
                gf_log (this->name, GF_LOG_ERROR, "Lookup failed on %s",
                        loc->path);
                goto out;
        }

        if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) {
                ret = gf_defrag_migrate_data (this, defrag, loc, migrate_data);
                if (ret)
                        goto out;
        }

        gf_log (this->name, GF_LOG_TRACE, "fix layout called on %s", loc->path);

        fd = fd_create (loc->inode, defrag->pid);
        if (!fd) {
                gf_log (this->name, GF_LOG_ERROR, "Failed to create fd");
                ret = -1;
                goto out;
        }

        ret = syncop_opendir (this, loc, fd);
        if (ret) {
                gf_log (this->name, GF_LOG_ERROR, "Failed to open dir %s",
                        loc->path);
                ret = -1;
                goto out;
        }

        INIT_LIST_HEAD (&entries.list);
        while ((ret = syncop_readdirp (this, fd, 131072, offset, NULL,
                                       &entries)) != 0)
        {

                if (ret < 0) {
                        gf_log (this->name, GF_LOG_ERROR, "Readdir returned %s"
                                ". Aborting fix-layout",strerror(errno));
                        goto out;
                }

                /* Need to keep track of ENOENT errno, that means, there is no
                   need to send more readdirp() */
                readdirp_errno = errno;

                if (list_empty (&entries.list))
                        break;

                /* From here on 'entries' owns dirents that the exit path
                   must release if we bail out of the inner loop. */
                free_entries = _gf_true;

                list_for_each_entry_safe (entry, tmp, &entries.list, list) {
                        /* Stop walking as soon as the status is no longer
                           "started". */
                        if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
                                ret = 1;
                                goto out;
                        }

                        /* Remember where to resume the next readdirp. */
                        offset = entry->d_off;

                        if (!strcmp (entry->d_name, ".") ||
                            !strcmp (entry->d_name, ".."))
                                continue;

                        /* Only directories carry a layout. */
                        if (!IA_ISDIR (entry->d_stat.ia_type))
                                continue;

                        loc_wipe (&entry_loc);
                        ret = dht_build_child_loc (this, &entry_loc, loc,
                                                   entry->d_name);
                        if (ret) {
                                gf_log (this->name, GF_LOG_ERROR, "Child loc"
                                        " build failed");
                                goto out;
                        }

                        if (uuid_is_null (entry->d_stat.ia_gfid)) {
                                gf_log (this->name, GF_LOG_ERROR, "%s/%s"
                                        " gfid not present", loc->path,
                                        entry->d_name);
                                continue;
                        }

                        entry_loc.inode->ia_type = entry->d_stat.ia_type;

                        uuid_copy (entry_loc.gfid, entry->d_stat.ia_gfid);
                        if (uuid_is_null (loc->gfid)) {
                                gf_log (this->name, GF_LOG_ERROR, "%s/%s"
                                        " gfid not present", loc->path,
                                        entry->d_name);
                                continue;
                        }

                        uuid_copy (entry_loc.pargfid, loc->gfid);

                        ret = syncop_lookup (this, &entry_loc, NULL, &iatt,
                                             NULL, NULL);
                        if (ret) {
                                gf_log (this->name, GF_LOG_ERROR, "%s"
                                        " lookup failed", entry_loc.path);
                                continue;
                        }

                        ret = syncop_setxattr (this, &entry_loc, fix_layout,
                                               0);
                        if (ret) {
                                gf_log (this->name, GF_LOG_ERROR, "Setxattr "
                                        "failed for %s", entry_loc.path);
                                defrag->defrag_status =
                                        GF_DEFRAG_STATUS_FAILED;
                                defrag->total_failures ++;
                                goto out;
                        }
                        ret = gf_defrag_fix_layout (this, defrag, &entry_loc,
                                                    fix_layout, migrate_data);

                        if (ret) {
                                gf_log (this->name, GF_LOG_ERROR, "Fix layout "
                                        "failed for %s", entry_loc.path);
                                defrag->total_failures++;
                                goto out;
                        }

                }
                gf_dirent_free (&entries);
                free_entries = _gf_false;
                INIT_LIST_HEAD (&entries.list);
                if (readdirp_errno == ENOENT)
                        break;
        }

        ret = 0;
out:
        if (free_entries)
                gf_dirent_free (&entries);

        loc_wipe (&entry_loc);

        if (fd)
                fd_unref (fd);

        return ret;

}


/*
 * gf_defrag_start_crawl - body of the rebalance task created by
 * gf_defrag_start().
 *
 * @data: the xlator_t the rebalance runs on.
 *
 * Builds the root inode/loc, looks up '/', sets the fix-layout xattr on it
 * and then walks the tree with gf_defrag_fix_layout(). Unless the command is
 * GF_DEFRAG_CMD_START_LAYOUT_FIX a "distribute.migrate-data" dict ("force" or
 * "non-force") is passed along as well. When the walk returns and the status
 * is neither stopped nor failed, the status becomes
 * GF_DEFRAG_STATUS_COMPLETE.
 *
 * On the way out the current status is reported through ctx->notify (when
 * set), defrag->is_exiting is raised, and the defrag structure is freed and
 * detached from conf.
 *
 * Returns the result of the last step executed (-1 when the arguments or the
 * xlator state are unusable).
 */
int
gf_defrag_start_crawl (void *data)
{
        xlator_t         *this         = NULL;
        dht_conf_t       *conf         = NULL;
        gf_defrag_info_t *defrag       = NULL;
        int               ret          = -1;
        loc_t             loc          = {0,};
        struct iatt       iatt         = {0,};
        struct iatt       parent       = {0,};
        dict_t           *fix_layout   = NULL;
        dict_t           *migrate_data = NULL;
        dict_t           *status       = NULL;
        glusterfs_ctx_t  *ctx          = NULL;

        this = data;
        if (!this)
                goto out;

        ctx = this->ctx;
        if (!ctx)
                goto out;

        conf = this->private;
        if (!conf)
                goto out;

        defrag = conf->defrag;
        if (!defrag)
                goto out;

        gettimeofday (&defrag->start_time, NULL);
        dht_build_root_inode (this, &defrag->root_inode);
        if (!defrag->root_inode)
                goto out;

        dht_build_root_loc (defrag->root_inode, &loc);

        /* fix-layout on '/' first */

        ret = syncop_lookup (this, &loc, NULL, &iatt, NULL, &parent);

        if (ret) {
                gf_log (this->name, GF_LOG_ERROR, "look up on / failed");
                goto out;
        }

        fix_layout = dict_new ();
        if (!fix_layout) {
                ret = -1;
                goto out;
        }

        ret = dict_set_str (fix_layout, GF_XATTR_FIX_LAYOUT_KEY, "yes");
        if (ret) {
                gf_log (this->name, GF_LOG_ERROR, "Failed to set dict str");
                goto out;
        }

        ret = syncop_setxattr (this, &loc, fix_layout, 0);
        if (ret) {
                gf_log (this->name, GF_LOG_ERROR, "fix layout on %s failed",
                        loc.path);
                defrag->total_failures++;
                goto out;
        }

        if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) {
                migrate_data = dict_new ();
                if (!migrate_data) {
                        ret = -1;
                        goto out;
                }
                if (defrag->cmd == GF_DEFRAG_CMD_START_FORCE)
                        ret = dict_set_str (migrate_data,
                                            "distribute.migrate-data", "force");
                else
                        ret = dict_set_str (migrate_data,
                                            "distribute.migrate-data",
                                            "non-force");
                if (ret)
                        goto out;
        }
        ret = gf_defrag_fix_layout (this, defrag, &loc, fix_layout,
                                    migrate_data);
        if ((defrag->defrag_status != GF_DEFRAG_STATUS_STOPPED) &&
            (defrag->defrag_status != GF_DEFRAG_STATUS_FAILED)) {
                defrag->defrag_status = GF_DEFRAG_STATUS_COMPLETE;
        }

out:
        /* Drop the references taken by dict_new () above; every use of
           these dicts in this function is synchronous. */
        if (fix_layout)
                dict_unref (fix_layout);
        if (migrate_data)
                dict_unref (migrate_data);

        /* The early exits above can land here before 'defrag' (and even
           'ctx'/'conf') were resolved; there is nothing to report or free
           in that case, and touching defrag->lock would dereference NULL. */
        if (!defrag)
                return ret;

        LOCK (&defrag->lock);
        {
                status = dict_new ();
                gf_defrag_status_get (defrag, status);
                if (ctx->notify)
                        ctx->notify (GF_EN_DEFRAG_STATUS, status);
                if (status)
                        dict_unref (status);
                defrag->is_exiting = 1;
        }
        UNLOCK (&defrag->lock);

        GF_FREE (defrag);
        conf->defrag = NULL;

        return ret;
}


/*
 * gf_defrag_done - completion callback handed to synctask_new () by
 * gf_defrag_start ().
 *
 * Stops the listener of the frame's xlator, destroys the frame's call stack
 * and sends SIGTERM to the current process. @ret and @data are not used.
 * Always returns 0.
 */
static int
gf_defrag_done (int ret, call_frame_t *sync_frame, void *data)
{
        gf_listener_stop (sync_frame->this);

        STACK_DESTROY (sync_frame->root);
        kill (getpid(), SIGTERM);
        return 0;
}

/*
 * gf_defrag_start - create the rebalance task.
 *
 * @data: the xlator_t to rebalance.
 *
 * Creates a frame tagged with GF_CLIENT_PID_DEFRAG, records that pid in the
 * defrag state, marks the status as GF_DEFRAG_STATUS_STARTED and schedules
 * gf_defrag_start_crawl () (with gf_defrag_done () as its completion
 * callback) through synctask_new (). Failures are only logged.
 *
 * Always returns NULL.
 */
void *
gf_defrag_start (void *data)
{
        int               ret    = -1;
        call_frame_t     *frame  = NULL;
        dht_conf_t       *conf   = NULL;
        gf_defrag_info_t *defrag = NULL;
        xlator_t         *this   = NULL;

        this = data;
        if (!this)
                goto out;

        conf = this->private;
        if (!conf)
                goto out;

        defrag = conf->defrag;
        if (!defrag)
                goto out;

        frame = create_frame (this, this->ctx->pool);
        if (!frame)
                goto out;

        frame->root->pid = GF_CLIENT_PID_DEFRAG;

        defrag->pid = frame->root->pid;

        defrag->defrag_status = GF_DEFRAG_STATUS_STARTED;

        ret = synctask_new (this->ctx->env, gf_defrag_start_crawl,
                            gf_defrag_done, frame, this);

        if (ret)
                gf_log (this->name, GF_LOG_ERROR, "Could not create"
                        " task for rebalance");
out:
        return NULL;
}

/*
 * gf_defrag_status_get - export and log the rebalance counters.
 *
 * @defrag: rebalance state to read; nothing is done when NULL or when the
 *          status is GF_DEFRAG_STATUS_NOT_STARTED.
 * @dict:   optional output dict. When non-NULL the keys "files", "size",
 *          "lookups", "status", "failures", "skipped" and (only when the
 *          elapsed time is non-zero) "run-time" are set; a failure to set a
 *          key is logged as a warning and does not stop the remaining keys
 *          from being set.
 *
 * The status and counters are also written to the log at INFO level.
 * Always returns 0.
 */
int
gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict)
{
        int             ret      = 0;
        uint64_t        files    = 0;
        uint64_t        size     = 0;
        uint64_t        lookup   = 0;
        uint64_t        failures = 0;
        uint64_t        skipped  = 0;
        char           *status   = "";
        double          elapsed  = 0;
        struct timeval  end      = {0,};


        if (!defrag)
                goto out;

        ret = 0;
        if (defrag->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED)
                goto out;

        files    = defrag->total_files;
        size     = defrag->total_data;
        lookup   = defrag->num_files_lookedup;
        failures = defrag->total_failures;
        skipped  = defrag->skipped;

        gettimeofday (&end, NULL);

        /* Whole seconds since the crawl recorded its start time. */
        elapsed = end.tv_sec - defrag->start_time.tv_sec;

        if (!dict)
                goto log;

        ret = dict_set_uint64 (dict, "files", files);
        if (ret)
                gf_log (THIS->name, GF_LOG_WARNING,
                        "failed to set file count");

        ret = dict_set_uint64 (dict, "size", size);
        if (ret)
                gf_log (THIS->name, GF_LOG_WARNING,
                        "failed to set size of xfer");

        ret = dict_set_uint64 (dict, "lookups", lookup);
        if (ret)
                gf_log (THIS->name, GF_LOG_WARNING,
                        "failed to set lookedup file count");


        ret = dict_set_int32 (dict, "status", defrag->defrag_status);
        if (ret)
                gf_log (THIS->name, GF_LOG_WARNING,
                        "failed to set status");
        if (elapsed) {
                ret = dict_set_double (dict, "run-time", elapsed);
                if (ret)
                        gf_log (THIS->name, GF_LOG_WARNING,
                                "failed to set run-time");
        }

        ret = dict_set_uint64 (dict, "failures", failures);
        if (ret)
                gf_log (THIS->name, GF_LOG_WARNING,
                        "failed to set failure count");

        ret = dict_set_uint64 (dict, "skipped", skipped);
        if (ret)
                gf_log (THIS->name, GF_LOG_WARNING,
                        "failed to set skipped file count");
log:
        switch (defrag->defrag_status) {
        case GF_DEFRAG_STATUS_NOT_STARTED:
                status = "not started";
                break;
        case GF_DEFRAG_STATUS_STARTED:
                status = "in progress";
                break;
        case GF_DEFRAG_STATUS_STOPPED:
                status = "stopped";
                break;
        case GF_DEFRAG_STATUS_COMPLETE:
                status = "completed";
                break;
        case GF_DEFRAG_STATUS_FAILED:
                status = "failed";
                break;
        default:
                break;
        }

        gf_log (THIS->name, GF_LOG_INFO, "Rebalance is %s. Time taken is %.2f "
                "secs", status, elapsed);
        gf_log (THIS->name, GF_LOG_INFO, "Files migrated: %"PRIu64", size: %"
                PRIu64", lookups: %"PRIu64", failures: %"PRIu64", skipped: "
                "%"PRIu64, files, size, lookup, failures, skipped);


out:
        return 0;
}

/*
 * gf_defrag_stop - mark a rebalance as stopped.
 *
 * @defrag: rebalance state to update.
 * @output: optional dict filled through gf_defrag_status_get ().
 *
 * Returns 0 when the status was switched to GF_DEFRAG_STATUS_STOPPED, -1
 * when @defrag is NULL or the rebalance has not been started.
 */
int
gf_defrag_stop (gf_defrag_info_t *defrag, dict_t *output)
{
        /* TODO: set a variable 'stop_defrag' here, it should be checked
           in defrag loop */
        int     ret = -1;
        GF_ASSERT (defrag);

        /* Do not rely on the assertion alone before dereferencing. */
        if (!defrag)
                goto out;

        if (defrag->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED) {
                goto out;
        }

        gf_log ("", GF_LOG_INFO, "Received stop command on rebalance");
        defrag->defrag_status = GF_DEFRAG_STATUS_STOPPED;

        if (output)
                gf_defrag_status_get (defrag, output);
        ret = 0;
out:
        gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
        return ret;
}
diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c index f6ed8769d..5d6f4f232 100644 --- a/xlators/cluster/dht/src/dht-rename.c +++ b/xlators/cluster/dht/src/dht-rename.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version.
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ /* TODO: link(oldpath, newpath) fails if newpath already exists. DHT should @@ -35,7 +26,8 @@ int dht_rename_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *stbuf, struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent) + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; @@ -75,65 +67,130 @@ dht_rename_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, unwind: this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) { - local->stbuf.ia_ino = local->loc.inode->ino; - - local->preoldparent.ia_ino = local->loc.parent->ino; - local->postoldparent.ia_ino = local->loc.parent->ino; - - local->preparent.ia_ino = local->loc2.parent->ino; - local->postparent.ia_ino = local->loc2.parent->ino; - WIPE (&local->preoldparent); WIPE (&local->postoldparent); WIPE (&local->preparent); WIPE (&local->postparent); + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, &local->stbuf, &local->preoldparent, &local->postoldparent, - &local->preparent, &local->postparent); + &local->preparent, &local->postparent, xdata); } return 0; } - int -dht_rename_dir_do (call_frame_t 
*frame, xlator_t *this) +dht_rename_hashed_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *stbuf, + struct iatt *preoldparent, + struct iatt *postoldparent, + struct iatt *prenewparent, + struct iatt *postnewparent, dict_t *xdata) { - dht_local_t *local = NULL; dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int call_cnt = 0; + call_frame_t *prev = NULL; int i = 0; conf = this->private; local = frame->local; + prev = cookie; - if (local->op_ret == -1) - goto err; + if (op_ret == -1) { + /* TODO: undo the damage */ - local->call_cnt = conf->subvolume_cnt; - local->op_ret = 0; + gf_log (this->name, GF_LOG_INFO, + "rename %s -> %s on %s failed (%s)", + local->loc.path, local->loc2.path, + prev->this->name, strerror (op_errno)); + + local->op_ret = op_ret; + local->op_errno = op_errno; + goto unwind; + } + /* TODO: construct proper stbuf for dir */ + /* + * FIXME: is this the correct way to build stbuf and + * parent bufs? 
+ */ + dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); + dht_iatt_merge (this, &local->preoldparent, preoldparent, + prev->this); + dht_iatt_merge (this, &local->postoldparent, postoldparent, + prev->this); + dht_iatt_merge (this, &local->preparent, prenewparent, + prev->this); + dht_iatt_merge (this, &local->postparent, postnewparent, + prev->this); + + call_cnt = local->call_cnt = conf->subvolume_cnt - 1; + + if (!local->call_cnt) + goto unwind; for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == local->dst_hashed) + continue; STACK_WIND (frame, dht_rename_dir_cbk, conf->subvolumes[i], conf->subvolumes[i]->fops->rename, - &local->loc, &local->loc2); + &local->loc, &local->loc2, NULL); + if (!--call_cnt) + break; } + + return 0; +unwind: + WIPE (&local->preoldparent); + WIPE (&local->postoldparent); + WIPE (&local->preparent); + WIPE (&local->postparent); + + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); + DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, + &local->stbuf, &local->preoldparent, + &local->postoldparent, + &local->preparent, &local->postparent, NULL); + + return 0; +} + + +int +dht_rename_dir_do (call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret == -1) + goto err; + + local->op_ret = 0; + + STACK_WIND (frame, dht_rename_hashed_dir_cbk, + local->dst_hashed, + local->dst_hashed->fops->rename, + &local->loc, &local->loc2, NULL); return 0; err: DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, NULL, NULL, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); return 0; } int dht_rename_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *entries) + int op_ret, int op_errno, gf_dirent_t *entries, + dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = -1; @@ -162,7 +219,7 @@ dht_rename_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int dht_rename_opendir_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd) + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = -1; @@ -182,7 +239,7 @@ dht_rename_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, STACK_WIND (frame, dht_rename_readdir_cbk, prev->this, prev->this->fops->readdir, - local->fd, 4096, 0); + local->fd, 4096, 0, NULL); return 0; @@ -238,22 +295,54 @@ dht_rename_dir (call_frame_t *frame, xlator_t *this) STACK_WIND (frame, dht_rename_opendir_cbk, conf->subvolumes[i], conf->subvolumes[i]->fops->opendir, - &local->loc2, local->fd); + &local->loc2, local->fd, NULL); } return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); return 0; } +#define DHT_MARK_FOP_INTERNAL(xattr) do { \ + int tmp = -1; \ + if (!xattr) { \ + xattr = dict_new (); \ + if (!xattr) \ + break; \ + } \ + tmp = dict_set_str (xattr, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); \ + if (tmp) { \ + gf_log (this->name, GF_LOG_ERROR, "Failed to set" \ + " internal dict key for %s", local->loc.path); \ + } \ + }while (0) +int +dht_rename_done (call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + + local = frame->local; + + if (local->linked == _gf_true) { + local->linked = _gf_false; + dht_linkfile_attr_heal (frame, this); + } + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); + DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, + &local->stbuf, &local->preoldparent, + &local->postoldparent, &local->preparent, + &local->postparent, NULL); + return 0; +} int dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -282,10 
+371,7 @@ dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, WIPE (&local->postparent); if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, - &local->stbuf, &local->preoldparent, - &local->postoldparent, &local->preparent, - &local->postparent); + dht_rename_done (frame, this); } out: @@ -303,7 +389,7 @@ dht_rename_cleanup (call_frame_t *frame) xlator_t *dst_hashed = NULL; xlator_t *dst_cached = NULL; int call_cnt = 0; - + dict_t *xattr = NULL; local = frame->local; this = frame->this; @@ -327,13 +413,15 @@ dht_rename_cleanup (call_frame_t *frame) if (!call_cnt) goto nolinks; + DHT_MARK_FOP_INTERNAL (xattr); + if (dst_hashed != src_hashed && dst_hashed != src_cached) { gf_log (this->name, GF_LOG_TRACE, "unlinking linkfile %s @ %s => %s", local->loc.path, dst_hashed->name, src_cached->name); STACK_WIND (frame, dht_rename_unlink_cbk, dst_hashed, dst_hashed->fops->unlink, - &local->loc); + &local->loc, 0, xattr); } if (src_cached != dst_hashed) { @@ -342,9 +430,12 @@ dht_rename_cleanup (call_frame_t *frame) local->loc2.path, src_cached->name); STACK_WIND (frame, dht_rename_unlink_cbk, src_cached, src_cached->fops->unlink, - &local->loc2); + &local->loc2, 0, xattr); } + if (xattr) + dict_unref (xattr); + return 0; nolinks: @@ -353,10 +444,40 @@ nolinks: WIPE (&local->preparent); WIPE (&local->postparent); + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, &local->stbuf, &local->preoldparent, &local->postoldparent, &local->preparent, - &local->postparent); + &local->postparent, NULL); + + return 0; +} + + +int +dht_rename_links_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + call_frame_t *prev = NULL; + dht_local_t *local = NULL; + + prev = cookie; + local = frame->local; + + if 
(op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "link/file %s on %s failed (%s)", + local->loc.path, prev->this->name, strerror (op_errno)); + } + + if (local->linked == _gf_true) { + local->linked = _gf_false; + dht_linkfile_attr_heal (frame, this); + } + DHT_STACK_DESTROY (frame); return 0; } @@ -366,7 +487,8 @@ int dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *stbuf, struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent) + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -375,6 +497,9 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, xlator_t *dst_hashed = NULL; xlator_t *dst_cached = NULL; xlator_t *rename_subvol = NULL; + call_frame_t *link_frame = NULL; + dht_local_t *link_local = NULL; + dict_t *xattr = NULL; local = frame->local; prev = cookie; @@ -384,6 +509,8 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dst_hashed = local->dst_hashed; dst_cached = local->dst_cached; + if (local->linked == _gf_true) + FRAME_SU_UNDO (frame, dht_local_t); if (op_ret == -1) { gf_log (this->name, GF_LOG_WARNING, "%s: rename on %s failed (%s)", local->loc.path, @@ -393,19 +520,45 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto cleanup; } - dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); - dht_iatt_merge (this, &local->preoldparent, preoldparent, prev->this); - dht_iatt_merge (this, &local->postoldparent, postoldparent, prev->this); - dht_iatt_merge (this, &local->preparent, prenewparent, prev->this); - dht_iatt_merge (this, &local->postparent, postnewparent, prev->this); + if ((src_cached == dst_cached) && (dst_hashed != dst_cached)) { + link_frame = copy_frame (frame); + if (!link_frame) { + goto err; + } - local->stbuf.ia_ino = local->loc.inode->ino; + /* fop value sent as maxvalue 
because it is not used + anywhere in this case */ + link_local = dht_local_init (link_frame, &local->loc2, NULL, + GF_FOP_MAXVALUE); + if (!link_local) { + goto err; + } - local->preoldparent.ia_ino = local->loc.parent->ino; - local->postoldparent.ia_ino = local->loc.parent->ino; + if (link_local->loc.inode) + inode_unref (link_local->loc.inode); + link_local->loc.inode = inode_ref (local->loc.inode); + uuid_copy (link_local->gfid, local->loc.inode->gfid); + + dht_linkfile_create (link_frame, dht_rename_links_create_cbk, + this, src_cached, dst_hashed, + &link_local->loc); + } + +err: + /* Merge attrs only from src_cached. In case there of src_cached != + * dst_hashed, this ignores linkfile attrs. */ + if (prev->this == src_cached) { + dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); + dht_iatt_merge (this, &local->preoldparent, preoldparent, + prev->this); + dht_iatt_merge (this, &local->postoldparent, postoldparent, + prev->this); + dht_iatt_merge (this, &local->preparent, prenewparent, + prev->this); + dht_iatt_merge (this, &local->postparent, postnewparent, + prev->this); + } - local->preparent.ia_ino = local->loc2.parent->ino; - local->postparent.ia_ino = local->loc2.parent->ino; /* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk * is called. 
since rename has already happened on rename_subvol, @@ -430,6 +583,8 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (local->call_cnt == 0) goto unwind; + DHT_MARK_FOP_INTERNAL (xattr); + if (src_cached != dst_hashed && src_cached != dst_cached) { gf_log (this->name, GF_LOG_TRACE, "deleting old src datafile %s @ %s", @@ -437,7 +592,7 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, STACK_WIND (frame, dht_rename_unlink_cbk, src_cached, src_cached->fops->unlink, - &local->loc); + &local->loc, 0, xattr); } if (src_hashed != rename_subvol && src_hashed != src_cached) { @@ -447,7 +602,7 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, STACK_WIND (frame, dht_rename_unlink_cbk, src_hashed, src_hashed->fops->unlink, - &local->loc); + &local->loc, 0, xattr); } if (dst_cached @@ -459,8 +614,10 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, STACK_WIND (frame, dht_rename_unlink_cbk, dst_cached, dst_cached->fops->unlink, - &local->loc2); + &local->loc2, 0, xattr); } + if (xattr) + dict_unref (xattr); return 0; unwind: @@ -468,15 +625,16 @@ unwind: WIPE (&local->postoldparent); WIPE (&local->preparent); WIPE (&local->postparent); + if (xattr) + dict_unref (xattr); - DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, - &local->stbuf, &local->preoldparent, - &local->postoldparent, &local->preparent, - &local->postparent); + dht_rename_done (frame, this); return 0; cleanup: + if (xattr) + dict_unref (xattr); dht_rename_cleanup (frame); return 0; @@ -510,9 +668,11 @@ dht_do_rename (call_frame_t *frame) "renaming %s => %s (%s)", local->loc.path, local->loc2.path, rename_subvol->name); + if (local->linked == _gf_true) + FRAME_SU_DO (frame, dht_local_t); STACK_WIND (frame, dht_rename_cbk, rename_subvol, rename_subvol->fops->rename, - &local->loc, &local->loc2); + &local->loc, &local->loc2, NULL); return 0; } @@ -522,7 +682,8 @@ int dht_rename_links_cbk (call_frame_t *frame, void 
*cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -537,7 +698,11 @@ dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "link/file on %s failed (%s)", prev->this->name, strerror (op_errno)); local->op_ret = -1; - local->op_errno = op_errno; + if (op_errno != ENOENT) + local->op_errno = op_errno; + } else if (local->src_cached == prev->this) { + /* merge of attr returned only from linkfile creation */ + dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); } this_call_cnt = dht_frame_return (frame); @@ -558,6 +723,42 @@ cleanup: int +dht_rename_unlink_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && (op_errno != ENOENT)) { + gf_log (this->name, GF_LOG_DEBUG, + "unlink of %s on %s failed (%s)", + local->loc2.path, prev->this->name, + strerror (op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + } + + if (local->op_ret == -1) + goto cleanup; + + dht_do_rename (frame); + + return 0; + +cleanup: + dht_rename_cleanup (frame); + + return 0; +} + + +int dht_rename_create_links (call_frame_t *frame) { dht_local_t *local = NULL; @@ -567,6 +768,7 @@ dht_rename_create_links (call_frame_t *frame) xlator_t *dst_hashed = NULL; xlator_t *dst_cached = NULL; int call_cnt = 0; + dict_t *xattr = NULL; local = frame->local; @@ -577,8 +779,21 @@ dht_rename_create_links (call_frame_t *frame) dst_hashed = local->dst_hashed; dst_cached = local->dst_cached; - if (src_cached == dst_cached) - goto nolinks; + DHT_MARK_FOP_INTERNAL (xattr); + + if (src_cached == dst_cached) { + if (dst_hashed 
== dst_cached) + goto nolinks; + + gf_log (this->name, GF_LOG_TRACE, + "unlinking dst linkfile %s @ %s", + local->loc2.path, dst_hashed->name); + + STACK_WIND (frame, dht_rename_unlink_links_cbk, + dst_hashed, dst_hashed->fops->unlink, + &local->loc2, 0, xattr); + return 0; + } if (dst_hashed != src_hashed && dst_hashed != src_cached) call_cnt++; @@ -593,24 +808,26 @@ dht_rename_create_links (call_frame_t *frame) "linkfile %s @ %s => %s", local->loc.path, dst_hashed->name, src_cached->name); memcpy (local->gfid, local->loc.inode->gfid, 16); - dht_linkfile_create (frame, dht_rename_links_cbk, - src_cached, dst_hashed, &local->loc); - } - - if (src_cached != dst_hashed) { - gf_log (this->name, GF_LOG_TRACE, - "link %s => %s (%s)", local->loc.path, - local->loc2.path, src_cached->name); - STACK_WIND (frame, dht_rename_links_cbk, - src_cached, src_cached->fops->link, - &local->loc, &local->loc2); - } + dht_linkfile_create (frame, dht_rename_links_cbk, this, + src_cached, dst_hashed, &local->loc); + } + + if (src_cached != dst_hashed) { + gf_log (this->name, GF_LOG_TRACE, + "link %s => %s (%s)", local->loc.path, + local->loc2.path, src_cached->name); + STACK_WIND (frame, dht_rename_links_cbk, + src_cached, src_cached->fops->link, + &local->loc, &local->loc2, xattr); + } nolinks: if (!call_cnt) { /* skip to next step */ dht_do_rename (frame); } + if (xattr) + dict_unref (xattr); return 0; } @@ -618,7 +835,7 @@ nolinks: int dht_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc) + loc_t *oldloc, loc_t *newloc, dict_t *xdata) { xlator_t *src_cached = NULL; xlator_t *src_hashed = NULL; @@ -663,17 +880,14 @@ dht_rename (call_frame_t *frame, xlator_t *this, if (newloc->inode) dst_cached = dht_subvol_get_cached (this, newloc->inode); - local = dht_local_init (frame); + local = dht_local_init (frame, oldloc, NULL, GF_FOP_RENAME); if (!local) { op_errno = ENOMEM; goto err; } - - ret = loc_copy (&local->loc, oldloc); - if (ret == -1) { - op_errno = 
ENOMEM; - goto err; - } + /* cached_subvol will be set from dht_local_init, reset it to NULL, + as the logic of handling rename is different */ + local->cached_subvol = NULL; ret = loc_copy (&local->loc2, newloc); if (ret == -1) { @@ -703,7 +917,8 @@ dht_rename (call_frame_t *frame, xlator_t *this, err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); return 0; } diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c index ddd043dc8..3fe96b1c7 100644 --- a/xlators/cluster/dht/src/dht-selfheal.c +++ b/xlators/cluster/dht/src/dht-selfheal.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #ifndef _CONFIG_H @@ -26,6 +17,50 @@ #include "glusterfs.h" #include "xlator.h" #include "dht-common.h" +#include "glusterfs-acl.h" + +#define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,cnt,path) do { \ + layout->list[i].start = srt; \ + layout->list[i].stop = srt + chunk - 1; \ + \ + gf_log (this->name, GF_LOG_TRACE, \ + "gave fix: %u - %u on %s for %s", \ + layout->list[i].start, layout->list[i].stop, \ + layout->list[i].xlator->name, path); \ + } while (0) + +#define DHT_RESET_LAYOUT_RANGE(layout) do { \ + int cnt = 0; \ + for (cnt = 0; cnt < layout->cnt; cnt++ ) { \ + layout->list[cnt].start = 0; \ + layout->list[cnt].stop = 0; \ + } \ + } while (0) + +static uint32_t +dht_overlap_calc (dht_layout_t *old, int o, dht_layout_t *new, int n) +{ + if (o >= old->cnt || n >= new->cnt) + return 0; + + if (old->list[o].err > 0 || new->list[n].err > 0) + return 0; + + if (old->list[o].start == old->list[o].stop) { + return 0; + } + + if (new->list[n].start == new->list[n].stop) { + return 0; + } + + if ((old->list[o].start > new->list[n].stop) || + (old->list[o].stop < new->list[n].start)) + return 0; + + return min (old->list[o].stop, new->list[n].stop) - + max (old->list[o].start, new->list[n].start) + 1; +} int @@ -35,7 +70,7 @@ dht_selfheal_dir_finish (call_frame_t *frame, xlator_t *this, int ret) local = frame->local; local->selfheal.dir_cbk (frame, NULL, frame->this, ret, - local->op_errno); + local->op_errno, NULL); return 0; } @@ -43,7 +78,7 @@ dht_selfheal_dir_finish (call_frame_t *frame, xlator_t *this, int ret) int dht_selfheal_dir_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) + int op_ret, int op_errno, dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -82,18 +117,32 @@ dht_selfheal_dir_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc, - dht_layout_t *layout, int i) + dht_layout_t *layout, int i, + xlator_t 
*req_subvol) { xlator_t *subvol = NULL; dict_t *xattr = NULL; int ret = 0; xlator_t *this = NULL; int32_t *disk_layout = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; - - subvol = layout->list[i].xlator; + local = frame->local; + if (req_subvol) + subvol = req_subvol; + else + subvol = layout->list[i].xlator; this = frame->this; + GF_VALIDATE_OR_GOTO ("", this, err); + GF_VALIDATE_OR_GOTO (this->name, layout, err); + GF_VALIDATE_OR_GOTO (this->name, local, err); + GF_VALIDATE_OR_GOTO (this->name, subvol, err); + VALIDATE_OR_GOTO (this->private, err); + + conf = this->private; + xattr = get_new_dict (); if (!xattr) { goto err; @@ -107,8 +156,7 @@ dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc, goto err; } - ret = dict_set_bin (xattr, "trusted.glusterfs.dht", - disk_layout, 4 * 4); + ret = dict_set_bin (xattr, conf->xattr_name, disk_layout, 4 * 4); if (ret == -1) { gf_log (this->name, GF_LOG_WARNING, "%s: (subvol %s) failed to set xattr dictionary", @@ -124,9 +172,12 @@ dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc, dict_ref (xattr); + if (!uuid_is_null (local->gfid)) + uuid_copy (loc->gfid, local->gfid); + STACK_WIND (frame, dht_selfheal_dir_xattr_cbk, subvol, subvol->fops->setxattr, - loc, xattr, 0); + loc, xattr, 0, NULL); dict_unref (xattr); @@ -136,14 +187,58 @@ err: if (xattr) dict_destroy (xattr); - if (disk_layout) - GF_FREE (disk_layout); + GF_FREE (disk_layout); dht_selfheal_dir_xattr_cbk (frame, subvol, frame->this, - -1, ENOMEM); + -1, ENOMEM, NULL); return 0; } +int +dht_fix_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) +{ + dht_local_t *local = NULL; + int i = 0; + int count = 0; + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + dht_layout_t *dummy = NULL; + + local = frame->local; + this = frame->this; + conf = this->private; + + gf_log (this->name, GF_LOG_DEBUG, + "writing the new range for all subvolumes"); + + local->call_cnt = count = conf->subvolume_cnt; + + for (i = 0; 
i < layout->cnt; i++) { + dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i, NULL); + + if (--count == 0) + goto out; + } + /* if we are here, subvolcount > layout_count. subvols-per-directory + * option might be set here. We need to clear out layout from the + * non-participating subvolumes, else it will result in overlaps */ + dummy = dht_layout_new (this, 1); + if (!dummy) + goto out; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (_gf_false == + dht_is_subvol_in_layout (layout, conf->subvolumes[i])) { + dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0, + conf->subvolumes[i]); + if (--count == 0) + break; + } + } + + dht_layout_unref (this, dummy); +out: + return 0; +} int dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) @@ -152,14 +247,17 @@ dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) int missing_xattr = 0; int i = 0; xlator_t *this = NULL; + dht_conf_t *conf = NULL; + dht_layout_t *dummy = NULL; local = frame->local; this = frame->this; + conf = this->private; for (i = 0; i < layout->cnt; i++) { if (layout->list[i].err != -1 || !layout->list[i].stop) { /* err != -1 would mean xattr present on the directory - * or the directory is itself non existant. + * or the directory is non existent. 
* !layout->list[i].stop would mean layout absent */ @@ -183,18 +281,30 @@ dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) if (layout->list[i].err != -1 || !layout->list[i].stop) continue; - dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i); + dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i, NULL); if (--missing_xattr == 0) break; } + dummy = dht_layout_new (this, 1); + if (!dummy) + goto out; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (_gf_false == + dht_is_subvol_in_layout (layout, conf->subvolumes[i])) { + dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0, + conf->subvolumes[i]); + } + } + dht_layout_unref (this, dummy); +out: return 0; } int dht_selfheal_dir_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *statpre, - struct iatt *statpost) + struct iatt *statpost, dict_t *xdata) { dht_local_t *local = NULL; dht_layout_t *layout = NULL; @@ -235,6 +345,9 @@ dht_selfheal_dir_setattr (call_frame_t *frame, loc_t *loc, struct iatt *stbuf, return 0; } + if (!uuid_is_null (local->gfid)) + uuid_copy (loc->gfid, local->gfid); + local->call_cnt = missing_attr; for (i = 0; i < layout->cnt; i++) { if (layout->list[i].err == -1) { @@ -245,7 +358,7 @@ dht_selfheal_dir_setattr (call_frame_t *frame, loc_t *loc, struct iatt *stbuf, STACK_WIND (frame, dht_selfheal_dir_setattr_cbk, layout->list[i].xlator, layout->list[i].xlator->fops->setattr, - loc, stbuf, valid); + loc, stbuf, valid, NULL); } } @@ -256,7 +369,8 @@ int dht_selfheal_dir_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; dht_layout_t *layout = NULL; @@ -289,9 +403,6 @@ dht_selfheal_dir_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } dht_iatt_merge (this, &local->stbuf, 
stbuf, prev->this); - if (prev->this == local->hashed_subvol) - local->ia_ino = local->stbuf.ia_ino; - dht_iatt_merge (this, &local->preparent, preparent, prev->this); dht_iatt_merge (this, &local->postparent, postparent, prev->this); @@ -305,6 +416,46 @@ out: return 0; } +void +dht_selfheal_dir_mkdir_setacl (dict_t *xattr, dict_t *dict) +{ + data_t *acl_default = NULL; + data_t *acl_access = NULL; + xlator_t *this = NULL; + int ret = -1; + + GF_ASSERT (xattr); + GF_ASSERT (dict); + + this = THIS; + GF_ASSERT (this); + + acl_default = dict_get (xattr, POSIX_ACL_DEFAULT_XATTR); + + if (!acl_default) { + gf_log (this->name, GF_LOG_DEBUG, + "ACL_DEFAULT xattr not present"); + goto cont; + } + ret = dict_set (dict, POSIX_ACL_DEFAULT_XATTR, acl_default); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "Could not set ACL_DEFAULT xattr"); +cont: + acl_access = dict_get (xattr, POSIX_ACL_ACCESS_XATTR); + if (!acl_access) { + gf_log (this->name, GF_LOG_DEBUG, + "ACL_ACCESS xattr not present"); + goto out; + } + ret = dict_set (dict, POSIX_ACL_ACCESS_XATTR, acl_access); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "Could not set ACL_ACCESS xattr"); + +out: + return; +} int dht_selfheal_dir_mkdir (call_frame_t *frame, loc_t *loc, @@ -338,16 +489,19 @@ dht_selfheal_dir_mkdir (call_frame_t *frame, loc_t *loc, ret = dict_set_static_bin (dict, "gfid-req", local->gfid, 16); if (ret) - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_WARNING, "%s: failed to set gfid in dict", loc->path); } else if (local->params) { /* Send the dictionary from higher layers directly */ dict = dict_ref (local->params); } + /* Set acls */ + if (local->xattr && dict) + dht_selfheal_dir_mkdir_setacl (local->xattr, dict); if (!dict) - gf_log (this->name, GF_LOG_DEBUG, - "dict is NULL, need to make sure gfid's are same"); + gf_log (this->name, GF_LOG_WARNING, + "dict is NULL, need to make sure gfids are same"); for (i = 0; i < layout->cnt; i++) { if (layout->list[i].err == ENOENT || 
force) { @@ -361,7 +515,7 @@ dht_selfheal_dir_mkdir (call_frame_t *frame, loc_t *loc, loc, st_mode_from_ia (local->stbuf.ia_prot, local->stbuf.ia_type), - dict); + 0, dict); } } @@ -380,7 +534,7 @@ dht_selfheal_layout_alloc_start (xlator_t *this, loc_t *loc, uint32_t hashval = 0; int ret = 0; - ret = dht_hash_compute (layout->type, loc->path, &hashval); + ret = dht_hash_compute (this, layout->type, loc->path, &hashval); if (ret == 0) { start = (hashval % layout->cnt); } @@ -388,117 +542,308 @@ dht_selfheal_layout_alloc_start (xlator_t *this, loc_t *loc, return start; } - -void -dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, - dht_layout_t *layout) +static inline int +dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout) { - xlator_t *this = NULL; - uint32_t chunk = 0; - int i = 0; - uint32_t start = 0; - int cnt = 0; - int err = 0; - int start_subvol = 0; + int i = 0; + int j = 0; + int err = 0; + int count = 0; + dht_conf_t *conf = NULL; - this = frame->this; + /* Gets in use only for replace-brick, remove-brick */ + conf = this->private; + for (i = 0; i < layout->cnt; i++) { + for (j = 0; j < conf->subvolume_cnt; j++) { + if (conf->decommissioned_bricks[j] && + conf->decommissioned_bricks[j] == layout->list[i].xlator) { + layout->list[i].err = EINVAL; + break; + } + } + } for (i = 0; i < layout->cnt; i++) { err = layout->list[i].err; - if (err == -1 || err == 0) { - layout->list[i].err = -1; - cnt++; + if (err == -1 || err == 0 || err == ENOENT) { + /* Setting list[i].err = -1 is an indication for + dht_selfheal_layout_new_directory() to assign + a range. We set it to -1 based on any one of + the three criteria: + + - err == -1 already, which means directory + existed but layout was not set on it. + + - err == 0, which means directory exists and + has an old layout piece which will be + overwritten now. 
+ + - err == ENOENT, which means directory does + not exist (possibly racing with mkdir or + finishing half done mkdir). The missing + directory will be attempted to be recreated. + + It is important to note that it is safe + to race with mkdir() as self-heal and + mkdir are idempotent operations. Both will + strive to set the directory and layouts to + the same final state. + */ + count++; + if (!err) + layout->list[i].err = -1; } } /* no subvolume has enough space, but can't stop directory creation */ - if (!cnt) { + if (!count || !new_layout) { for (i = 0; i < layout->cnt; i++) { err = layout->list[i].err; if (err == ENOSPC) { layout->list[i].err = -1; - cnt++; + count++; + } + } + } + + /* if layout->spread_cnt is set, check if it is <= available + * subvolumes (down brick and decommissioned bricks are considered + * un-availbale). Else return count (available up bricks) */ + count = ((layout->spread_cnt && + (layout->spread_cnt <= count)) ? + layout->spread_cnt : ((count) ? count : 1)); + + return count; +} + + +void dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, + dht_layout_t *new_layout); + +void dht_layout_entry_swap (dht_layout_t *layout, int i, int j); +void dht_layout_range_swap (dht_layout_t *layout, int i, int j); + +/* + * It's a bit icky using local variables in a macro, but it makes the rest + * of the code a lot clearer. + */ +#define OV_ENTRY(x,y) table[x*new->cnt+y] + +void +dht_selfheal_layout_maximize_overlap (call_frame_t *frame, loc_t *loc, + dht_layout_t *new, dht_layout_t *old) +{ + int i = 0; + int j = 0; + uint32_t curr_overlap = 0; + uint32_t max_overlap = 0; + int max_overlap_idx = -1; + uint32_t overlap = 0; + uint32_t *table = NULL; + + dht_layout_sort_volname (old); + /* Now both old_layout->list[] and new_layout->list[] + are match the same xlators/subvolumes. i.e, + old_layout->[i] and new_layout->[i] are referring + to the same subvolumes + */ + + /* Build a table of overlaps between new[i] and old[j]. 
*/ + table = alloca(sizeof(overlap)*old->cnt*new->cnt); + if (!table) { + return; + } + memset(table,0,sizeof(overlap)*old->cnt*new->cnt); + for (i = 0; i < new->cnt; ++i) { + for (j = 0; j < old->cnt; ++j) { + OV_ENTRY(i,j) = dht_overlap_calc(old,j,new,i); + } + } + + for (i = 0; i < new->cnt; i++) { + if (new->list[i].err > 0) { + /* Subvol might be marked for decommission + with EINVAL, or some other serious error + marked with positive errno. + */ + continue; + } + + max_overlap = 0; + max_overlap_idx = i; + for (j = (i + 1); j < new->cnt; ++j) { + if (new->list[j].err > 0) { + /* Subvol might be marked for decommission + with EINVAL, or some other serious error + marked with positive errno. + */ + continue; + } + /* Calculate the overlap now. */ + curr_overlap = OV_ENTRY(i,i) + OV_ENTRY(j,j); + /* Calculate the overlap after the proposed swap. */ + overlap = OV_ENTRY(i,j) + OV_ENTRY(j,i); + /* Are we better than status quo? */ + if (overlap > curr_overlap) { + overlap -= curr_overlap; + /* Are we better than the previous choice? */ + if (overlap > max_overlap) { + max_overlap = overlap; + max_overlap_idx = j; + } } } + + if (max_overlap_idx != i) { + dht_layout_range_swap (new, i, max_overlap_idx); + /* Need to swap the table values too. 
*/ + for (j = 0; j < old->cnt; ++j) { + overlap = OV_ENTRY(i,j); + OV_ENTRY(i,j) = OV_ENTRY(max_overlap_idx,j); + OV_ENTRY(max_overlap_idx,j) = overlap; + } + } + } +} + + +dht_layout_t * +dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc, + dht_layout_t *layout) +{ + int i = 0; + xlator_t *this = NULL; + dht_layout_t *new_layout = NULL; + dht_conf_t *priv = NULL; + dht_local_t *local = NULL; + uint32_t subvol_down = 0; + int ret = 0; + + this = frame->this; + priv = this->private; + local = frame->local; + + if (layout->type == DHT_HASH_TYPE_DM_USER) { + gf_log (THIS->name, GF_LOG_DEBUG, "leaving %s alone", + loc->path); + goto done; + } + + new_layout = dht_layout_new (this, priv->subvolume_cnt); + if (!new_layout) + goto done; + + /* If a subvolume is down, do not re-write the layout. */ + ret = dht_layout_anomalies (this, loc, layout, NULL, NULL, NULL, + &subvol_down, NULL, NULL); + + if (subvol_down || (ret == -1)) { + gf_log (this->name, GF_LOG_WARNING, "%u subvolume(s) are down" + ". Skipping fix layout.", subvol_down); + GF_FREE (new_layout); + return NULL; + } + + for (i = 0; i < new_layout->cnt; i++) { + if (layout->list[i].err != ENOSPC) + new_layout->list[i].err = layout->list[i].err; + else + new_layout->list[i].err = -1; + + new_layout->list[i].xlator = layout->list[i].xlator; + } + + /* First give it a layout as though it is a new directory. 
This + ensures rotation to kick in */ + dht_layout_sort_volname (new_layout); + dht_selfheal_layout_new_directory (frame, loc, new_layout); + + /* Now selectively re-assign ranges only when it helps */ + dht_selfheal_layout_maximize_overlap (frame, loc, new_layout, layout); + +done: + if (new_layout) { + /* Now that the new layout has all the proper layout, change the + inode context */ + dht_layout_set (this, loc->inode, new_layout); + + /* Make sure the extra 'ref' for existing layout is removed */ + dht_layout_unref (this, local->layout); + + local->layout = new_layout; } + return local->layout; +} + + +void +dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, + dht_layout_t *layout) +{ + xlator_t *this = NULL; + uint32_t chunk = 0; + int i = 0; + uint32_t start = 0; + int cnt = 0; + int err = 0; + int start_subvol = 0; + + this = frame->this; + + cnt = dht_get_layout_count (this, layout, 1); + chunk = ((unsigned long) 0xffffffff) / ((cnt) ? cnt : 1); start_subvol = dht_selfheal_layout_alloc_start (this, loc, layout); + /* clear out the range, as we are re-computing here */ + DHT_RESET_LAYOUT_RANGE (layout); for (i = start_subvol; i < layout->cnt; i++) { err = layout->list[i].err; - if (err == -1) { - layout->list[i].start = start; - layout->list[i].stop = start + chunk - 1; - - start = start + chunk; - - gf_log (this->name, GF_LOG_TRACE, - "gave fix: %u - %u on %s for %s", - layout->list[i].start, layout->list[i].stop, - layout->list[i].xlator->name, loc->path); + if (err == -1 || err == ENOENT) { + DHT_SET_LAYOUT_RANGE(layout, i, start, chunk, + cnt, loc->path); if (--cnt == 0) { layout->list[i].stop = 0xffffffff; - break; + goto done; } + start += chunk; } } for (i = 0; i < start_subvol; i++) { err = layout->list[i].err; - if (err == -1) { - layout->list[i].start = start; - layout->list[i].stop = start + chunk - 1; - - start = start + chunk; - - gf_log (this->name, GF_LOG_TRACE, - "gave fix: %u - %u on %s for %s", - layout->list[i].start, 
layout->list[i].stop, - layout->list[i].xlator->name, loc->path); + if (err == -1 || err == ENOENT) { + DHT_SET_LAYOUT_RANGE(layout, i, start, chunk, + cnt, loc->path); if (--cnt == 0) { layout->list[i].stop = 0xffffffff; - break; + goto done; } + start += chunk; } } -} +done: + return; +} int dht_selfheal_dir_getafix (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) { - dht_conf_t *conf = NULL; - xlator_t *this = NULL; dht_local_t *local = NULL; - int missing = -1; - int down = -1; - int holes = -1; + uint32_t holes = 0; int ret = -1; int i = -1; - int overlaps = -1; + uint32_t overlaps = 0; - this = frame->this; - conf = this->private; local = frame->local; - missing = local->selfheal.missing; - down = local->selfheal.down; holes = local->selfheal.hole_cnt; overlaps = local->selfheal.overlaps_cnt; - if ((missing + down) == conf->subvolume_cnt) { - dht_selfheal_layout_new_directory (frame, loc, layout); - ret = 0; - } - - if (holes <= down) { - /* the down subvol might fill up the holes */ - ret = 0; - } - if (holes || overlaps) { dht_selfheal_layout_new_directory (frame, loc, layout); ret = 0; @@ -535,13 +880,35 @@ dht_selfheal_new_directory (call_frame_t *frame, return 0; } +int +dht_fix_directory_layout (call_frame_t *frame, + dht_selfheal_dir_cbk_t dir_cbk, + dht_layout_t *layout) +{ + dht_local_t *local = NULL; + dht_layout_t *tmp_layout = NULL; + + local = frame->local; + + local->selfheal.dir_cbk = dir_cbk; + local->selfheal.layout = dht_layout_ref (frame->this, layout); + + /* No layout sorting required here */ + tmp_layout = dht_fix_layout_of_directory (frame, &local->loc, layout); + if (!tmp_layout) { + return -1; + } + dht_fix_dir_xattr (frame, &local->loc, tmp_layout); + + return 0; +} + int dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, loc_t *loc, dht_layout_t *layout) { dht_local_t *local = NULL; - uint32_t holes = 0; uint32_t down = 0; uint32_t misc = 0; int ret = 0; @@ -553,11 +920,9 @@ dht_selfheal_directory 
(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, dht_layout_anomalies (this, loc, layout, &local->selfheal.hole_cnt, &local->selfheal.overlaps_cnt, - &local->selfheal.missing, - &local->selfheal.down, - &local->selfheal.misc); + NULL, &local->selfheal.down, + &local->selfheal.misc, NULL); - holes = local->selfheal.hole_cnt; down = local->selfheal.down; misc = local->selfheal.misc; @@ -565,14 +930,14 @@ dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, local->selfheal.layout = dht_layout_ref (this, layout); if (down) { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_WARNING, "%d subvolumes down -- not fixing", down); ret = 0; goto sorry_no_fix; } if (misc) { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_WARNING, "%d subvolumes have unrecoverable errors", misc); ret = 0; goto sorry_no_fix; @@ -582,7 +947,7 @@ dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, ret = dht_selfheal_dir_getafix (frame, loc, layout); if (ret == -1) { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_WARNING, "not able to form layout for the directory"); goto sorry_no_fix; } @@ -615,3 +980,50 @@ dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, return ret; } + +int +dht_dir_attr_heal (void *data) +{ + call_frame_t *frame = NULL; + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + int call_cnt = 0; + int ret = -1; + int i = 0; + + GF_VALIDATE_OR_GOTO ("dht", data, out); + + frame = data; + local = frame->local; + this = frame->this; + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", local, out); + conf = this->private; + GF_VALIDATE_OR_GOTO ("dht", conf, out); + + call_cnt = conf->subvolume_cnt; + + for (i = 0; i < call_cnt; i++) { + subvol = conf->subvolumes[i]; + if (!subvol || (subvol == dht_first_up_subvol (this))) + continue; + ret = syncop_setattr (subvol, &local->loc, 
&local->stbuf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID), + NULL, NULL); + if (ret) + gf_log ("dht", GF_LOG_ERROR, "Failed to set uid/gid on" + " %s on %s subvol (%s)", local->loc.path, + subvol->name, strerror (errno)); + } +out: + return 0; +} + +int +dht_dir_attr_heal_done (int ret, call_frame_t *sync_frame, void *data) +{ + DHT_STACK_DESTROY (sync_frame); + return 0; +} diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c new file mode 100644 index 000000000..70aac7710 --- /dev/null +++ b/xlators/cluster/dht/src/dht-shared.c @@ -0,0 +1,758 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +/* TODO: add NS locking */ + +#include "statedump.h" +#include "dht-common.h" + +/* TODO: + - use volumename in xattr instead of "dht" + - use NS locks + - handle all cases in self heal layout reconstruction + - complete linkfile selfheal +*/ +struct volume_options options[]; + +void +dht_layout_dump (dht_layout_t *layout, const char *prefix) +{ + + char key[GF_DUMP_MAX_BUF_LEN]; + int i = 0; + + if (!layout) + goto out; + if (!prefix) + goto out; + + gf_proc_dump_build_key(key, prefix, "cnt"); + gf_proc_dump_write(key, "%d", layout->cnt); + gf_proc_dump_build_key(key, prefix, "preset"); + gf_proc_dump_write(key, "%d", layout->preset); + gf_proc_dump_build_key(key, prefix, "gen"); + gf_proc_dump_write(key, "%d", layout->gen); + if (layout->type != IA_INVAL) { + gf_proc_dump_build_key(key, prefix, "inode type"); + gf_proc_dump_write(key, "%d", layout->type); + } + + if (!IA_ISDIR (layout->type)) + goto out; + + for (i = 0; i < layout->cnt; i++) { + 
gf_proc_dump_build_key(key, prefix,"list[%d].err", i); + gf_proc_dump_write(key, "%d", layout->list[i].err); + gf_proc_dump_build_key(key, prefix,"list[%d].start", i); + gf_proc_dump_write(key, "%u", layout->list[i].start); + gf_proc_dump_build_key(key, prefix,"list[%d].stop", i); + gf_proc_dump_write(key, "%u", layout->list[i].stop); + if (layout->list[i].xlator) { + gf_proc_dump_build_key(key, prefix, + "list[%d].xlator.type", i); + gf_proc_dump_write(key, "%s", + layout->list[i].xlator->type); + gf_proc_dump_build_key(key, prefix, + "list[%d].xlator.name", i); + gf_proc_dump_write(key, "%s", + layout->list[i].xlator->name); + } + } + +out: + return; +} + + +int32_t +dht_priv_dump (xlator_t *this) +{ + char key_prefix[GF_DUMP_MAX_BUF_LEN]; + char key[GF_DUMP_MAX_BUF_LEN]; + int i = 0; + dht_conf_t *conf = NULL; + int ret = -1; + + if (!this) + goto out; + + conf = this->private; + if (!conf) + goto out; + + ret = TRY_LOCK(&conf->subvolume_lock); + if (ret != 0) { + return ret; + } + + gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name); + gf_proc_dump_build_key(key_prefix,"xlator.cluster.dht","%s.priv", + this->name); + gf_proc_dump_write("subvol_cnt","%d", conf->subvolume_cnt); + for (i = 0; i < conf->subvolume_cnt; i++) { + sprintf (key, "subvolumes[%d]", i); + gf_proc_dump_write(key, "%s.%s", conf->subvolumes[i]->type, + conf->subvolumes[i]->name); + if (conf->file_layouts && conf->file_layouts[i]){ + sprintf (key, "file_layouts[%d]", i); + dht_layout_dump(conf->file_layouts[i], key); + } + if (conf->dir_layouts && conf->dir_layouts[i]) { + sprintf (key, "dir_layouts[%d]", i); + dht_layout_dump(conf->dir_layouts[i], key); + } + if (conf->subvolume_status) { + + sprintf (key, "subvolume_status[%d]", i); + gf_proc_dump_write(key, "%d", + (int)conf->subvolume_status[i]); + } + + } + + gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed); + gf_proc_dump_write("gen", "%d", conf->gen); + gf_proc_dump_write("min_free_disk", "%lf", 
conf->min_free_disk); + gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes); + gf_proc_dump_write("disk_unit", "%c", conf->disk_unit); + gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval); + gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit); + if (conf ->du_stats) { + gf_proc_dump_write("du_stats.avail_percent", "%lf", + conf->du_stats->avail_percent); + gf_proc_dump_write("du_stats.avail_space", "%lu", + conf->du_stats->avail_space); + gf_proc_dump_write("du_stats.avail_inodes", "%lf", + conf->du_stats->avail_inodes); + gf_proc_dump_write("du_stats.log", "%lu", conf->du_stats->log); + } + + if (conf->last_stat_fetch.tv_sec) + gf_proc_dump_write("last_stat_fetch", "%s", + ctime(&conf->last_stat_fetch.tv_sec)); + + UNLOCK(&conf->subvolume_lock); + +out: + return ret; +} + +int32_t +dht_inodectx_dump (xlator_t *this, inode_t *inode) +{ + int ret = -1; + dht_layout_t *layout = NULL; + + if (!this) + goto out; + if (!inode) + goto out; + + ret = dht_inode_ctx_layout_get (inode, this, &layout); + + if ((ret != 0) || !layout) + return ret; + + gf_proc_dump_add_section("xlator.cluster.dht.%s.inode", this->name); + dht_layout_dump(layout, "layout"); + +out: + return ret; +} + +void +dht_fini (xlator_t *this) +{ + int i = 0; + dht_conf_t *conf = NULL; + + GF_VALIDATE_OR_GOTO ("dht", this, out); + + conf = this->private; + this->private = NULL; + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + GF_FREE (conf->file_layouts[i]); + } + GF_FREE (conf->file_layouts); + } + + GF_FREE (conf->subvolumes); + + GF_FREE (conf->subvolume_status); + + GF_FREE (conf); + } +out: + return; +} + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + GF_VALIDATE_OR_GOTO ("dht", this, out); + + ret = xlator_mem_acct_init (this, gf_dht_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); + return ret; + } +out: + return ret; +} + + +int 
+dht_parse_decommissioned_bricks (xlator_t *this, dht_conf_t *conf, + const char *bricks) +{ + int i = 0; + int ret = -1; + char *tmpstr = NULL; + char *dup_brick = NULL; + char *node = NULL; + + if (!conf || !bricks) + goto out; + + dup_brick = gf_strdup (bricks); + node = strtok_r (dup_brick, ",", &tmpstr); + while (node) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!strcmp (conf->subvolumes[i]->name, node)) { + conf->decommissioned_bricks[i] = + conf->subvolumes[i]; + conf->decommission_subvols_cnt++; + gf_log (this->name, GF_LOG_INFO, + "decommissioning subvolume %s", + conf->subvolumes[i]->name); + break; + } + } + if (i == conf->subvolume_cnt) { + /* Wrong node given. */ + goto out; + } + node = strtok_r (NULL, ",", &tmpstr); + } + + ret = 0; + conf->decommission_in_progress = 1; +out: + GF_FREE (dup_brick); + + return ret; +} + + +int +dht_decommissioned_remove (xlator_t *this, dht_conf_t *conf) +{ + int i = 0; + int ret = -1; + + if (!conf) + goto out; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->decommissioned_bricks[i]) { + conf->decommissioned_bricks[i] = NULL; + conf->decommission_subvols_cnt--; + } + } + + ret = 0; +out: + + return ret; +} +void +dht_init_regex (xlator_t *this, dict_t *odict, char *name, + regex_t *re, gf_boolean_t *re_valid) +{ + char *temp_str; + + if (dict_get_str (odict, name, &temp_str) != 0) { + if (strcmp(name,"rsync-hash-regex")) { + return; + } + temp_str = "^\\.(.+)\\.[^.]+$"; + } + + if (*re_valid) { + regfree(re); + *re_valid = _gf_false; + } + + if (!strcmp(temp_str,"none")) { + return; + } + + if (regcomp(re,temp_str,REG_EXTENDED) == 0) { + gf_log (this->name, GF_LOG_INFO, + "using regex %s = %s", name, temp_str); + *re_valid = _gf_true; + } + else { + gf_log (this->name, GF_LOG_WARNING, + "compiling regex %s failed", temp_str); + } +} + +int +dht_reconfigure (xlator_t *this, dict_t *options) +{ + dht_conf_t *conf = NULL; + char *temp_str = NULL; + gf_boolean_t search_unhashed; + int ret = -1; + + 
GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", options, out); + + conf = this->private; + if (!conf) + return 0; + + if (dict_get_str (options, "lookup-unhashed", &temp_str) == 0) { + /* If option is not "auto", other options _should_ be boolean*/ + if (strcasecmp (temp_str, "auto")) { + if (!gf_string2boolean (temp_str, &search_unhashed)) { + gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:" + " lookup-unhashed reconfigured (%s)", + temp_str); + conf->search_unhashed = search_unhashed; + } else { + gf_log(this->name, GF_LOG_ERROR, "Reconfigure:" + " lookup-unhashed should be boolean," + " not (%s), defaulting to (%d)", + temp_str, conf->search_unhashed); + //return -1; + ret = -1; + goto out; + } + } else { + gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:" + " lookup-unhashed reconfigured auto "); + conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; + } + } + + GF_OPTION_RECONF ("min-free-disk", conf->min_free_disk, options, + percent_or_size, out); + /* option can be any one of percent or bytes */ + conf->disk_unit = 0; + if (conf->min_free_disk < 100.0) + conf->disk_unit = 'p'; + + GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options, + percent, out); + + GF_OPTION_RECONF ("directory-layout-spread", conf->dir_spread_cnt, + options, uint32, out); + + GF_OPTION_RECONF ("readdir-optimize", conf->readdir_optimize, options, + bool, out); + if (conf->defrag) { + GF_OPTION_RECONF ("rebalance-stats", conf->defrag->stats, + options, bool, out); + } + + if (dict_get_str (options, "decommissioned-bricks", &temp_str) == 0) { + ret = dht_parse_decommissioned_bricks (this, conf, temp_str); + if (ret == -1) + goto out; + } else { + ret = dht_decommissioned_remove (this, conf); + if (ret == -1) + goto out; + } + + dht_init_regex (this, options, "rsync-hash-regex", + &conf->rsync_regex, &conf->rsync_regex_valid); + dht_init_regex (this, options, "extra-hash-regex", + &conf->extra_regex, &conf->extra_regex_valid); + + ret = 0; +out: + return 
ret; +} + +static int +gf_defrag_pattern_list_fill (xlator_t *this, gf_defrag_info_t *defrag, char *data) +{ + int ret = -1; + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *dup_str = NULL; + char *num = NULL; + char *pattern_str = NULL; + char *pattern = NULL; + gf_defrag_pattern_list_t *temp_list = NULL; + gf_defrag_pattern_list_t *pattern_list = NULL; + + if (!this || !defrag || !data) + goto out; + + /* Get the pattern for pattern list. "pattern:<optional-size>" + * eg: *avi, *pdf:10MB, *:1TB + */ + pattern_str = strtok_r (data, ",", &tmp_str); + while (pattern_str) { + dup_str = gf_strdup (pattern_str); + pattern_list = GF_CALLOC (1, sizeof (gf_defrag_pattern_list_t), + 1); + if (!pattern_list) { + goto out; + } + pattern = strtok_r (dup_str, ":", &tmp_str1); + num = strtok_r (NULL, ":", &tmp_str1); + if (!pattern) + goto out; + if (!num) { + if (gf_string2bytesize(pattern, &pattern_list->size) + == 0) { + pattern = "*"; + } + } else if (gf_string2bytesize (num, &pattern_list->size) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "invalid number format \"%s\"", num); + goto out; + } + memcpy (pattern_list->path_pattern, pattern, strlen (dup_str)); + + if (!defrag->defrag_pattern) + temp_list = NULL; + else + temp_list = defrag->defrag_pattern; + + pattern_list->next = temp_list; + + defrag->defrag_pattern = pattern_list; + pattern_list = NULL; + + GF_FREE (dup_str); + dup_str = NULL; + + pattern_str = strtok_r (NULL, ",", &tmp_str); + } + + ret = 0; +out: + if (ret) + GF_FREE (pattern_list); + GF_FREE (dup_str); + + return ret; +} + +int +dht_init (xlator_t *this) +{ + dht_conf_t *conf = NULL; + char *temp_str = NULL; + int ret = -1; + int i = 0; + gf_defrag_info_t *defrag = NULL; + int cmd = 0; + char *node_uuid = NULL; + + + GF_VALIDATE_OR_GOTO ("dht", this, err); + + if (!this->children) { + gf_log (this->name, GF_LOG_CRITICAL, + "Distribute needs more than one subvolume"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, 
GF_LOG_WARNING, + "dangling volume. check volfile"); + } + + conf = GF_CALLOC (1, sizeof (*conf), gf_dht_mt_dht_conf_t); + if (!conf) { + goto err; + } + + ret = dict_get_int32 (this->options, "rebalance-cmd", &cmd); + + if (cmd) { + defrag = GF_CALLOC (1, sizeof (gf_defrag_info_t), + gf_defrag_info_mt); + + GF_VALIDATE_OR_GOTO (this->name, defrag, err); + + LOCK_INIT (&defrag->lock); + + defrag->is_exiting = 0; + + conf->defrag = defrag; + + ret = dict_get_str (this->options, "node-uuid", &node_uuid); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "node-uuid not " + "specified"); + goto err; + } + + if (uuid_parse (node_uuid, defrag->node_uuid)) { + gf_log (this->name, GF_LOG_ERROR, "Cannot parse " + "glusterd node uuid"); + goto err; + } + + defrag->cmd = cmd; + + defrag->stats = _gf_false; + } + + conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON; + if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) { + /* If option is not "auto", other options _should_ be boolean */ + if (strcasecmp (temp_str, "auto")) + gf_string2boolean (temp_str, &conf->search_unhashed); + else + conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; + } + + GF_OPTION_INIT ("unhashed-sticky-bit", conf->unhashed_sticky_bit, bool, + err); + + GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err); + + GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size, + err); + + GF_OPTION_INIT ("min-free-inodes", conf->min_free_inodes, percent, + err); + + conf->dir_spread_cnt = conf->subvolume_cnt; + GF_OPTION_INIT ("directory-layout-spread", conf->dir_spread_cnt, + uint32, err); + + GF_OPTION_INIT ("assert-no-child-down", conf->assert_no_child_down, + bool, err); + + GF_OPTION_INIT ("readdir-optimize", conf->readdir_optimize, bool, err); + + if (defrag) { + GF_OPTION_INIT ("rebalance-stats", defrag->stats, bool, err); + if (dict_get_str (this->options, "rebalance-filter", &temp_str) + == 0) { + if (gf_defrag_pattern_list_fill (this, defrag, temp_str) + == -1) { 
+ gf_log (this->name, GF_LOG_ERROR, "Cannot parse" + " rebalance-filter (%s)", temp_str); + goto err; + } + } + } + + /* option can be any one of percent or bytes */ + conf->disk_unit = 0; + if (conf->min_free_disk < 100) + conf->disk_unit = 'p'; + + ret = dht_init_subvolumes (this, conf); + if (ret == -1) { + goto err; + } + + if (dict_get_str (this->options, "decommissioned-bricks", &temp_str) == 0) { + ret = dht_parse_decommissioned_bricks (this, conf, temp_str); + if (ret == -1) + goto err; + } + + dht_init_regex (this, this->options, "rsync-hash-regex", + &conf->rsync_regex, &conf->rsync_regex_valid); + dht_init_regex (this, this->options, "extra-hash-regex", + &conf->extra_regex, &conf->extra_regex_valid); + + ret = dht_layouts_init (this, conf); + if (ret == -1) { + goto err; + } + + LOCK_INIT (&conf->subvolume_lock); + LOCK_INIT (&conf->layout_lock); + + conf->gen = 1; + + this->local_pool = mem_pool_new (dht_local_t, 512); + if (!this->local_pool) { + gf_log (this->name, GF_LOG_ERROR, + "failed to create local_t's memory pool"); + goto err; + } + + GF_OPTION_INIT ("xattr-name", conf->xattr_name, str, err); + gf_asprintf (&conf->link_xattr_name, "%s.linkto", conf->xattr_name); + gf_asprintf (&conf->wild_xattr_name, "%s*", conf->xattr_name); + if (!conf->link_xattr_name || !conf->wild_xattr_name) { + goto err; + } + + this->private = conf; + + return 0; + +err: + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + GF_FREE (conf->file_layouts[i]); + } + GF_FREE (conf->file_layouts); + } + + GF_FREE (conf->subvolumes); + + GF_FREE (conf->subvolume_status); + + GF_FREE (conf->du_stats); + + GF_FREE (conf->defrag); + + GF_FREE (conf->xattr_name); + GF_FREE (conf->link_xattr_name); + GF_FREE (conf->wild_xattr_name); + + GF_FREE (conf); + } + + return -1; +} + + +struct volume_options options[] = { + { .key = {"lookup-unhashed"}, + .value = {"auto", "yes", "no", "enable", "disable", "1", "0", + "on", "off"}, + .type = 
GF_OPTION_TYPE_STR, + .default_value = "on", + .description = "This option if set to ON, does a lookup through " + "all the sub-volumes, in case a lookup didn't return any result " + "from the hash subvolume. If set to OFF, it does not do a lookup " + "on the remaining subvolumes." + }, + { .key = {"min-free-disk"}, + .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, + .default_value = "10%", + .description = "Percentage/Size of disk space, after which the " + "process starts balancing out the cluster, and logs will appear " + "in log files", + }, + { .key = {"min-free-inodes"}, + .type = GF_OPTION_TYPE_PERCENT, + .default_value = "5%", + .description = "after system has only N% of inodes, warnings " + "starts to appear in log files", + }, + { .key = {"unhashed-sticky-bit"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + }, + { .key = {"use-readdirp"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "This option if set to ON, forces the use of " + "readdirp, and hence also displays the stats of the files." + }, + { .key = {"assert-no-child-down"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option if set to ON, in the event of " + "CHILD_DOWN, will call exit." + }, + { .key = {"directory-layout-spread"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .validate = GF_OPT_VALIDATE_MIN, + .description = "Specifies the directory layout spread." + }, + { .key = {"decommissioned-bricks"}, + .type = GF_OPTION_TYPE_ANY, + .description = "This option if set to ON, decommissions " + "the brick, so that no new data is allowed to be created " + "on that brick." 
+ }, + { .key = {"rebalance-cmd"}, + .type = GF_OPTION_TYPE_INT, + }, + { .key = {"node-uuid"}, + .type = GF_OPTION_TYPE_STR, + }, + { .key = {"rebalance-stats"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option if set to ON displays and logs the " + " time taken for migration of each file, during the rebalance " + "process. If set to OFF, the rebalance logs will only display the " + "time spent in each directory." + }, + { .key = {"readdir-optimize"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option if set to ON enables the optimization " + "that allows DHT to requests non-first subvolumes to filter out " + "directory entries." + }, + { .key = {"rsync-hash-regex"}, + .type = GF_OPTION_TYPE_STR, + /* Setting a default here doesn't work. See dht_init_regex. */ + .description = "Regular expression for stripping temporary-file " + "suffix and prefix used by rsync, to prevent relocation when the " + "file is renamed." + }, + { .key = {"extra-hash-regex"}, + .type = GF_OPTION_TYPE_STR, + /* Setting a default here doesn't work. See dht_init_regex. */ + .description = "Regular expression for stripping temporary-file " + "suffix and prefix used by an application, to prevent relocation when " + "the file is renamed." + }, + { .key = {"rebalance-filter"}, + .type = GF_OPTION_TYPE_STR, + }, + + { .key = {"xattr-name"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "trusted.glusterfs.dht", + .description = "Base for extended attributes used by this " + "translator instance, to avoid conflicts with others above or " + "below it." 
+ }, + + /* NUFA option */ + { .key = {"local-volume-name"}, + .type = GF_OPTION_TYPE_XLATOR + }, + + /* switch option */ + { .key = {"pattern.switch.case"}, + .type = GF_OPTION_TYPE_ANY + }, + + { .key = {NULL} }, +}; diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c index c9b77d644..fc0ca2f77 100644 --- a/xlators/cluster/dht/src/dht.c +++ b/xlators/cluster/dht/src/dht.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ @@ -23,478 +14,23 @@ #include "config.h" #endif -/* TODO: add NS locking */ - #include "statedump.h" -#include "dht-common.c" - -/* TODO: - - use volumename in xattr instead of "dht" - - use NS locks - - handle all cases in self heal layout reconstruction - - complete linkfile selfheal -*/ - - -void -dht_layout_dump (dht_layout_t *layout, const char *prefix) -{ - - char key[GF_DUMP_MAX_BUF_LEN]; - int i = 0; - - GF_VALIDATE_OR_GOTO ("dht", layout, out); - GF_VALIDATE_OR_GOTO ("dht", prefix, out); - - gf_proc_dump_build_key(key, prefix, "cnt"); - gf_proc_dump_write(key, "%d", layout->cnt); - gf_proc_dump_build_key(key, prefix, "preset"); - gf_proc_dump_write(key, "%d", layout->preset); - gf_proc_dump_build_key(key, prefix, "gen"); - gf_proc_dump_write(key, "%d", layout->gen); - gf_proc_dump_build_key(key, prefix, "type"); - gf_proc_dump_write(key, "%d", layout->type); - - for (i = 0; i < layout->cnt; i++) { - gf_proc_dump_build_key(key, prefix,"list[%d].err", i); - gf_proc_dump_write(key, "%d", layout->list[i].err); - gf_proc_dump_build_key(key, prefix,"list[%d].start", i); - gf_proc_dump_write(key, "%u", layout->list[i].start); - gf_proc_dump_build_key(key, prefix,"list[%d].stop", i); - gf_proc_dump_write(key, "%u", layout->list[i].stop); - if (layout->list[i].xlator) { - gf_proc_dump_build_key(key, prefix, - "list[%d].xlator.type", i); - gf_proc_dump_write(key, "%s", - layout->list[i].xlator->type); - gf_proc_dump_build_key(key, prefix, - "list[%d].xlator.name", i); - gf_proc_dump_write(key, "%s", - layout->list[i].xlator->name); - } - } - -out: - return; -} - - -int32_t -dht_priv_dump (xlator_t *this) -{ - char key_prefix[GF_DUMP_MAX_BUF_LEN]; - char key[GF_DUMP_MAX_BUF_LEN]; - int i = 0; - dht_conf_t *conf = NULL; - int ret = -1; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - - conf = this->private; - - if (!conf) - return -1; - - ret = TRY_LOCK(&conf->subvolume_lock); - - if (ret != 0) { - gf_log("", GF_LOG_WARNING, "Unable to lock dht subvolume %s", - 
this->name); - return ret; - } - - gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name); - gf_proc_dump_build_key(key_prefix,"xlator.cluster.dht","%s.priv", - this->name); - gf_proc_dump_build_key(key, key_prefix, "subvolume_cnt"); - gf_proc_dump_write(key,"%d", conf->subvolume_cnt); - for (i = 0; i < conf->subvolume_cnt; i++) { - gf_proc_dump_build_key(key, key_prefix, "subvolumes[%d]", i); - gf_proc_dump_write(key, "%s.%s", conf->subvolumes[i]->type, - conf->subvolumes[i]->name); - if (conf->file_layouts && conf->file_layouts[i]){ - gf_proc_dump_build_key(key, key_prefix, - "file_layouts[%d]",i); - dht_layout_dump(conf->file_layouts[i], key); - } - if (conf->dir_layouts && conf->dir_layouts[i]) { - gf_proc_dump_build_key(key, key_prefix, - "dir_layouts[%d]",i); - dht_layout_dump(conf->dir_layouts[i], key); - } - if (conf->subvolume_status) { - gf_proc_dump_build_key(key, key_prefix, - "subvolume_status[%d]", i); - gf_proc_dump_write(key, "%d", - (int)conf->subvolume_status[i]); - } - - } - - gf_proc_dump_build_key(key, key_prefix,"default_dir_layout"); - dht_layout_dump(conf->default_dir_layout, key); - - gf_proc_dump_build_key(key, key_prefix, "search_unhashed"); - gf_proc_dump_write(key, "%d", conf->search_unhashed); - gf_proc_dump_build_key(key, key_prefix, "gen"); - gf_proc_dump_write(key, "%d", conf->gen); - gf_proc_dump_build_key(key, key_prefix, "min_free_disk"); - gf_proc_dump_write(key, "%lu", conf->min_free_disk); - gf_proc_dump_build_key(key, key_prefix, "disk_unit"); - gf_proc_dump_write(key, "%c", conf->disk_unit); - gf_proc_dump_build_key(key, key_prefix, "refresh_interval"); - gf_proc_dump_write(key, "%d", conf->refresh_interval); - gf_proc_dump_build_key(key, key_prefix, "unhashed_sticky_bit"); - gf_proc_dump_write(key, "%d", conf->unhashed_sticky_bit); - if (conf ->du_stats) { - gf_proc_dump_build_key(key, key_prefix, - "du_stats.avail_percent"); - gf_proc_dump_write(key, "%lf", conf->du_stats->avail_percent); - 
gf_proc_dump_build_key(key, key_prefix, - "du_stats.avail_space"); - gf_proc_dump_write(key, "%lu", conf->du_stats->avail_space); - gf_proc_dump_build_key(key, key_prefix, - "du_stats.log"); - gf_proc_dump_write(key, "%lu", conf->du_stats->log); - } - gf_proc_dump_build_key(key, key_prefix, "last_stat_fetch"); - gf_proc_dump_write(key, "%s", ctime(&conf->last_stat_fetch.tv_sec)); - - UNLOCK(&conf->subvolume_lock); - -out: - return ret; -} - -int32_t -dht_inodectx_dump (xlator_t *this, inode_t *inode) -{ - int ret = -1; - char key_prefix[GF_DUMP_MAX_BUF_LEN]; - dht_layout_t *layout = NULL; - uint64_t tmp_layout = 0; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", inode, out); - - ret = inode_ctx_get (inode, this, &tmp_layout); - - if (ret != 0) - return ret; - - layout = (dht_layout_t *)(long)tmp_layout; - - if (!layout) - return -1; - - gf_proc_dump_build_key(key_prefix, "xlator.cluster.dht", - "%s.inode.%ld", this->name, inode->ino); - dht_layout_dump(layout, key_prefix); - -out: - return ret; -} - -int -notify (xlator_t *this, int event, void *data, ...) 
-{ - int ret = -1; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - - ret = dht_notify (this, event, data); - -out: - return ret; -} - -void -fini (xlator_t *this) -{ - int i = 0; - dht_conf_t *conf = NULL; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - - conf = this->private; - this->private = NULL; - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - if (conf->default_dir_layout) - GF_FREE (conf->default_dir_layout); - - if (conf->subvolumes) - GF_FREE (conf->subvolumes); - - if (conf->subvolume_status) - GF_FREE (conf->subvolume_status); - - GF_FREE (conf); - } -out: - return; -} - -int32_t -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - - ret = xlator_mem_acct_init (this, gf_dht_mt_end + 1); - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } -out: - return ret; -} - -int -validate_options (xlator_t *this, char **op_errstr) -{ - int ret = 0; - volume_opt_list_t *vol_opt = NULL; - volume_opt_list_t *tmp; - - if (!this) { - gf_log (this->name, GF_LOG_DEBUG, "'this' not a valid ptr"); - ret =-1; - goto out; - } - - if (list_empty (&this->volume_options)) - goto out; - - vol_opt = list_entry (this->volume_options.next, - volume_opt_list_t, list); - list_for_each_entry_safe (vol_opt, tmp, &this->volume_options, list) { - ret = validate_xlator_volume_options_attacherr (this, - vol_opt->given_opt, - op_errstr); - } - -out: - - return ret; -} - -int -reconfigure (xlator_t *this, dict_t *options) -{ - dht_conf_t *conf = NULL; - char *temp_str = NULL; - gf_boolean_t search_unhashed; - uint32_t temp_free_disk = 0; - int ret = -1; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", options, out); - - conf = this->private; - if (!conf) - return 0; - - if (dict_get_str (options, "lookup-unhashed", &temp_str) == 0) { - /* If option is not 
"auto", other options _should_ be boolean*/ - if (strcasecmp (temp_str, "auto")) { - if (!gf_string2boolean (temp_str, &search_unhashed)) { - gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:" - " lookup-unahashed reconfigured (%s)", - temp_str); - conf->search_unhashed = search_unhashed; - } else { - gf_log(this->name, GF_LOG_ERROR, "Reconfigure:" - " lookup-unahashed should be boolean," - " not (%s), defaulting to (%d)", - temp_str, conf->search_unhashed); - //return -1; - ret = -1; - goto out; - } - } else { - gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:" - " lookup-unahashed reconfigured auto "); - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; - } - } - - if (dict_get_str (options, "min-free-disk", &temp_str) == 0) { - if (gf_string2percent (temp_str, &temp_free_disk) == 0) { - if (temp_free_disk > 100) { - gf_string2bytesize (temp_str, - &conf->min_free_disk); - conf->disk_unit = 'b'; - } else { - conf->min_free_disk = (uint64_t)temp_free_disk; - } - } else { - gf_string2bytesize (temp_str, &conf->min_free_disk); - conf->disk_unit = 'b'; - } - - gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:" - " min-free-disk reconfigured to %s", - temp_str); - } - ret = 0; -out: - return ret; -} - -int -init (xlator_t *this) -{ - dht_conf_t *conf = NULL; - char *temp_str = NULL; - int ret = -1; - int i = 0; - uint32_t temp_free_disk = 0; - - GF_VALIDATE_OR_GOTO ("dht", this, err); - - if (!this->children) { - gf_log (this->name, GF_LOG_CRITICAL, - "Distribute needs more than one subvolume"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. 
check volfile"); - } - - conf = GF_CALLOC (1, sizeof (*conf), gf_dht_mt_dht_conf_t); - if (!conf) { - goto err; - } - - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON; - if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) { - /* If option is not "auto", other options _should_ be boolean */ - if (strcasecmp (temp_str, "auto")) - gf_string2boolean (temp_str, &conf->search_unhashed); - else - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; - } - - conf->unhashed_sticky_bit = 0; - - if (dict_get_str (this->options, "unhashed-sticky-bit", - &temp_str) == 0) { - gf_string2boolean (temp_str, &conf->unhashed_sticky_bit); - } - - conf->use_readdirp = 1; - - if (dict_get_str (this->options, "use-readdirp", - &temp_str) == 0) { - gf_string2boolean (temp_str, &conf->use_readdirp); - } - - conf->disk_unit = 'p'; - conf->min_free_disk = 10; - - if (dict_get_str (this->options, "min-free-disk", &temp_str) == 0) { - if (gf_string2percent (temp_str, &temp_free_disk) == 0) { - if (temp_free_disk > 100) { - gf_string2bytesize (temp_str, - &conf->min_free_disk); - conf->disk_unit = 'b'; - } else { - conf->min_free_disk = (uint64_t)temp_free_disk; - } - } else { - gf_string2bytesize (temp_str, &conf->min_free_disk); - conf->disk_unit = 'b'; - } - } - - - ret = dht_init_subvolumes (this, conf); - if (ret == -1) { - goto err; - } - - ret = dht_layouts_init (this, conf); - if (ret == -1) { - goto err; - } - - conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t), - gf_dht_mt_dht_du_t); - if (!conf->du_stats) { - goto err; - } - - LOCK_INIT (&conf->subvolume_lock); - LOCK_INIT (&conf->layout_lock); - - conf->gen = 1; - - this->private = conf; - - return 0; - -err: - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - if (conf->default_dir_layout) - GF_FREE (conf->default_dir_layout); - - if (conf->subvolumes) - GF_FREE (conf->subvolumes); - 
- if (conf->subvolume_status) - GF_FREE (conf->subvolume_status); - - if (conf->du_stats) - GF_FREE (conf->du_stats); - - GF_FREE (conf); - } - - return -1; -} +#include "dht-common.h" +class_methods_t class_methods = { + .init = dht_init, + .fini = dht_fini, + .reconfigure = dht_reconfigure, + .notify = dht_notify +}; struct xlator_fops fops = { .lookup = dht_lookup, .mknod = dht_mknod, .create = dht_create, - .stat = dht_stat, - .fstat = dht_fstat, - .truncate = dht_truncate, - .ftruncate = dht_ftruncate, - .access = dht_access, - .readlink = dht_readlink, - .setxattr = dht_setxattr, - .fsetxattr = dht_fsetxattr, - .getxattr = dht_getxattr, - .removexattr = dht_removexattr, .open = dht_open, - .readv = dht_readv, - .writev = dht_writev, - .flush = dht_flush, - .fsync = dht_fsync, .statfs = dht_statfs, - .lk = dht_lk, .opendir = dht_opendir, .readdir = dht_readdir, .readdirp = dht_readdirp, @@ -505,14 +41,38 @@ struct xlator_fops fops = { .mkdir = dht_mkdir, .rmdir = dht_rmdir, .rename = dht_rename, - .inodelk = dht_inodelk, - .finodelk = dht_finodelk, .entrylk = dht_entrylk, .fentrylk = dht_fentrylk, + + /* Inode read operations */ + .stat = dht_stat, + .fstat = dht_fstat, + .access = dht_access, + .readlink = dht_readlink, + .getxattr = dht_getxattr, + .fgetxattr = dht_fgetxattr, + .readv = dht_readv, + .flush = dht_flush, + .fsync = dht_fsync, + .inodelk = dht_inodelk, + .finodelk = dht_finodelk, + .lk = dht_lk, + + /* Inode write operations */ + .fremovexattr = dht_fremovexattr, + .removexattr = dht_removexattr, + .setxattr = dht_setxattr, + .fsetxattr = dht_fsetxattr, + .truncate = dht_truncate, + .ftruncate = dht_ftruncate, + .writev = dht_writev, .xattrop = dht_xattrop, .fxattrop = dht_fxattrop, .setattr = dht_setattr, .fsetattr = dht_fsetattr, + .fallocate = dht_fallocate, + .discard = dht_discard, + .zerofill = dht_zerofill, }; struct xlator_dumpops dumpops = { @@ -526,22 +86,4 @@ struct xlator_cbks cbks = { // .releasedir = dht_releasedir, .forget = 
dht_forget }; - - -struct volume_options options[] = { - { .key = {"lookup-unhashed"}, - .value = {"auto", "yes", "no", "enable", "disable", "1", "0", - "on", "off"}, - .type = GF_OPTION_TYPE_STR - }, - { .key = {"min-free-disk"}, - .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, - }, - { .key = {"unhashed-sticky-bit"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {"use-readdirp"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {NULL} }, -}; +; diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c index 6f14362f4..e934acdf0 100644 --- a/xlators/cluster/dht/src/nufa.c +++ b/xlators/cluster/dht/src/nufa.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ @@ -23,10 +14,12 @@ #include "config.h" #endif -#include "dht-common.c" +#include "dht-common.h" /* TODO: all 'TODO's in dht.c holds good */ +extern struct volume_options options[]; + int nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, @@ -44,7 +37,6 @@ nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int call_cnt = 0; int ret = 0; - conf = this->private; prev = cookie; @@ -62,15 +54,12 @@ nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret == -1) goto out; - is_linkfile = check_is_linkfile (inode, stbuf, xattr); + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); is_dir = check_is_dir (inode, stbuf, xattr); if (!is_dir && !is_linkfile) { /* non-directory and not a linkfile */ - - dht_itransform (this, prev->this, stbuf->ia_ino, - &stbuf->ia_ino); - ret = dht_layout_preset (this, prev->this, inode); if (ret < 0) { gf_log (this->name, GF_LOG_DEBUG, @@ -145,7 +134,7 @@ out: err: DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, - inode, stbuf, xattr, NULL); + inode, stbuf, xattr, postparent); return 0; } @@ -154,7 +143,6 @@ nufa_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) { xlator_t *hashed_subvol = NULL; - xlator_t *cached_subvol = NULL; xlator_t *subvol = NULL; dht_local_t *local = NULL; dht_conf_t *conf = NULL; @@ -173,21 +161,12 @@ nufa_lookup (call_frame_t *frame, xlator_t *this, conf = this->private; - local = dht_local_init (frame); + local = dht_local_init (frame, loc, NULL, GF_FOP_LOOKUP); if (!local) { op_errno = ENOMEM; goto err; } - ret = loc_dup (loc, &local->loc); - if (ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "copying location failed for path=%s", - loc->path); - goto err; - } - if (xattr_req) { local->xattr_req = dict_ref (xattr_req); } else { @@ -195,14 +174,11 @@ nufa_lookup (call_frame_t *frame, xlator_t *this, } hashed_subvol = 
dht_subvol_get_hashed (this, &local->loc); - cached_subvol = dht_subvol_get_cached (this, local->loc.inode); - local->cached_subvol = cached_subvol; local->hashed_subvol = hashed_subvol; if (is_revalidate (loc)) { - local->layout = layout = dht_layout_get (this, loc->inode); - + layout = local->layout; if (!layout) { gf_log (this->name, GF_LOG_DEBUG, "revalidate without cache. path=%s", @@ -219,8 +195,7 @@ nufa_lookup (call_frame_t *frame, xlator_t *this, goto do_fresh_lookup; } - local->inode = inode_ref (loc->inode); - local->ia_ino = loc->inode->ino; + local->inode = inode_ref (loc->inode); local->call_cnt = layout->cnt; call_cnt = local->call_cnt; @@ -229,7 +204,7 @@ nufa_lookup (call_frame_t *frame, xlator_t *this, * revalidates directly go to the cached-subvolume. */ ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + conf->xattr_name, 4 * 4); if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, "Failed to set dict value."); @@ -250,7 +225,7 @@ nufa_lookup (call_frame_t *frame, xlator_t *this, } else { do_fresh_lookup: ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + conf->xattr_name, 4 * 4); if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, "Failed to set dict value."); @@ -259,7 +234,7 @@ nufa_lookup (call_frame_t *frame, xlator_t *this, } ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht.linkto", 256); + conf->link_xattr_name, 256); if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, "Failed to set dict value."); @@ -278,7 +253,8 @@ nufa_lookup (call_frame_t *frame, xlator_t *this, err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, + NULL); return 0; } @@ -287,7 +263,7 @@ nufa_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; @@ -298,28 +274,27 @@ nufa_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, STACK_WIND (frame, dht_create_cbk, local->cached_subvol, local->cached_subvol->fops->create, - &local->loc, local->flags, local->mode, local->fd, - local->params); + &local->loc, local->flags, local->mode, local->umask, + local->fd, local->params); return 0; err: DHT_STACK_UNWIND (create, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL, NULL); return 0; } int nufa_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params) + mode_t umask, fd_t *fd, dict_t *params) { dht_local_t *local = NULL; dht_conf_t *conf = NULL; xlator_t *subvol = NULL; xlator_t *avail_subvol = NULL; int op_errno = -1; - int ret = -1; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -329,7 +304,7 @@ nufa_create (call_frame_t *frame, xlator_t *this, dht_get_du_info (frame, this, loc); - local = dht_local_init (frame); + local = dht_local_init (frame, loc, fd, GF_FOP_CREATE); if (!local) { op_errno = ENOMEM; goto err; @@ -348,26 +323,19 @@ nufa_create (call_frame_t *frame, xlator_t *this, if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) { avail_subvol = dht_free_disk_available_subvol (this, - (xlator_t *)conf->private); + (xlator_t *)conf->private, + local); } if (subvol != avail_subvol) { /* create a link file instead of actual file */ - ret = loc_copy (&local->loc, loc); - if (ret == -1) { - op_errno = ENOMEM; - goto err; - } - - 
local->fd = fd_ref (fd); local->params = dict_ref (params); local->mode = mode; local->flags = flags; - + local->umask = umask; local->cached_subvol = avail_subvol; - dht_linkfile_create (frame, - nufa_create_linkfile_create_cbk, - avail_subvol, subvol, loc); + dht_linkfile_create (frame, nufa_create_linkfile_create_cbk, + this, avail_subvol, subvol, loc); return 0; } @@ -376,14 +344,14 @@ nufa_create (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, dht_create_cbk, subvol, subvol->fops->create, - loc, flags, mode, fd, params); + loc, flags, mode, umask, fd, params); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; DHT_STACK_UNWIND (create, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL, NULL); return 0; } @@ -392,41 +360,45 @@ int nufa_mknod_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; local = frame->local; + if (!local || !local->cached_subvol) { + op_errno = EINVAL; + op_ret = -1; + goto err; + } if (op_ret >= 0) { - STACK_WIND (frame, dht_newfile_cbk, - local->cached_subvol, + STACK_WIND_COOKIE (frame, dht_newfile_cbk, + (void *)local->cached_subvol, local->cached_subvol, local->cached_subvol->fops->mknod, &local->loc, local->mode, local->rdev, - local->params); + local->umask, local->params); return 0; } - +err: WIPE (postparent); WIPE (preparent); DHT_STACK_UNWIND (link, frame, op_ret, op_errno, - inode, stbuf, preparent, postparent); + inode, stbuf, preparent, postparent, xdata); return 0; } int nufa_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t rdev, dict_t *params) + loc_t *loc, mode_t mode, dev_t rdev, mode_t umask, dict_t *params) { dht_local_t *local = NULL; dht_conf_t *conf = NULL; xlator_t *subvol = NULL; xlator_t *avail_subvol = NULL; int op_errno = -1; - int 
ret = -1; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -436,7 +408,7 @@ nufa_mknod (call_frame_t *frame, xlator_t *this, dht_get_du_info (frame, this, loc); - local = dht_local_init (frame); + local = dht_local_init (frame, loc, NULL, GF_FOP_MKNOD); if (!local) { op_errno = ENOMEM; goto err; @@ -456,23 +428,20 @@ nufa_mknod (call_frame_t *frame, xlator_t *this, if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) { avail_subvol = dht_free_disk_available_subvol (this, - (xlator_t *)conf->private); + (xlator_t *)conf->private, + local); } if (avail_subvol != subvol) { /* Create linkfile first */ - ret = loc_copy (&local->loc, loc); - if (ret == -1) { - op_errno = ENOMEM; - goto err; - } local->params = dict_ref (params); local->mode = mode; + local->umask = umask; local->rdev = rdev; local->cached_subvol = avail_subvol; - dht_linkfile_create (frame, nufa_mknod_linkfile_cbk, + dht_linkfile_create (frame, nufa_mknod_linkfile_cbk, this, avail_subvol, subvol, loc); return 0; } @@ -480,208 +449,185 @@ nufa_mknod (call_frame_t *frame, xlator_t *this, gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->mknod, - loc, mode, rdev, params); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, subvol, + subvol->fops->mknod, loc, mode, rdev, umask, + params); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; DHT_STACK_UNWIND (mknod, frame, -1, op_errno, - NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL); return 0; } -int -notify (xlator_t *this, int event, void *data, ...) 
+gf_boolean_t +same_first_part (char *str1, char term1, char *str2, char term2) { - int ret = -1; + gf_boolean_t ended1; + gf_boolean_t ended2; + + for (;;) { + ended1 = ((*str1 == '\0') || (*str1 == term1)); + ended2 = ((*str2 == '\0') || (*str2 == term2)); + if (ended1 && ended2) { + return _gf_true; + } + if (ended1 || ended2 || (*str1 != *str2)) { + return _gf_false; + } + ++str1; + ++str2; + } +} - ret = dht_notify (this, event, data); +typedef struct nufa_args { + xlator_t *this; + char *volname; + gf_boolean_t addr_match; +} nufa_args_t; + +static void +nufa_find_local_brick (xlator_t *xl, void *data) +{ + nufa_args_t *args = data; + xlator_t *this = args->this; + char *local_volname = args->volname; + gf_boolean_t addr_match = args->addr_match; + char *brick_host = NULL; + dht_conf_t *conf = this->private; + int ret = -1; + + /*This means a local subvol was already found. We pick the first brick + * that is local*/ + if (conf->private) + return; + + if (strcmp (xl->name, local_volname) == 0) { + conf->private = xl; + gf_log (this->name, GF_LOG_INFO, "Using specified subvol %s", + local_volname); + return; + } + + if (!addr_match) + return; + + ret = dict_get_str (xl->options, "remote-host", &brick_host); + if ((ret == 0) && + (gf_is_same_address (local_volname, brick_host) || + gf_is_local_addr (brick_host))) { + conf->private = xl; + gf_log (this->name, GF_LOG_INFO, "Using the first local " + "subvol %s", xl->name); + return; + } - return ret; } -void -fini (xlator_t *this) +static void +nufa_to_dht (xlator_t *this) { - int i = 0; - dht_conf_t *conf = NULL; + GF_ASSERT (this); + GF_ASSERT (this->fops); - conf = this->private; - - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } + this->fops->lookup = dht_lookup; + this->fops->create = dht_create; + this->fops->mknod = dht_mknod; +} - if (conf->default_dir_layout) - GF_FREE 
(conf->default_dir_layout); +int +nufa_find_local_subvol (xlator_t *this, + void (*fn) (xlator_t *each, void* data), void *data) +{ + int ret = -1; + dht_conf_t *conf = this->private; + xlator_list_t *trav = NULL; + xlator_t *parent = NULL; + xlator_t *candidate = NULL; + + xlator_foreach_depth_first (this, fn, data); + if (!conf->private) { + gf_log (this->name, GF_LOG_ERROR, "Couldn't find a local " + "brick"); + return -1; + } - if (conf->subvolumes) - GF_FREE (conf->subvolumes); + candidate = conf->private; + trav = candidate->parents; + while (trav) { - if (conf->subvolume_status) - GF_FREE (conf->subvolume_status); + parent = trav->xlator; + if (strcmp (parent->type, "cluster/nufa") == 0) { + gf_log (this->name, GF_LOG_INFO, "Found local subvol, " + "%s", candidate->name); + ret = 0; + conf->private = candidate; + break; + } - GF_FREE (conf); + candidate = parent; + trav = parent->parents; } - return; + return ret; } int -init (xlator_t *this) +nufa_init (xlator_t *this) { - dht_conf_t *conf = NULL; - xlator_list_t *trav = NULL; data_t *data = NULL; char *local_volname = NULL; - char *temp_str = NULL; int ret = -1; - int i = 0; char my_hostname[256]; - uint32_t temp_free_disk = 0; + gf_boolean_t addr_match = _gf_false; + nufa_args_t args = {0, }; - if (!this->children) { - gf_log (this->name, GF_LOG_CRITICAL, - "NUFA needs more than one subvolume"); - return -1; + ret = dht_init(this); + if (ret) { + return ret; } - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. 
check volfile"); - } + if ((data = dict_get (this->options, "local-volume-name"))) { + local_volname = data->data; - conf = GF_CALLOC (1, sizeof (*conf), - gf_dht_mt_dht_conf_t); - if (!conf) { - goto err; - } + } else { + addr_match = _gf_true; + local_volname = "localhost"; + ret = gethostname (my_hostname, 256); + if (ret == 0) + local_volname = my_hostname; - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON; - if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) { - /* If option is not "auto", other options _should_ be boolean */ - if (strcasecmp (temp_str, "auto")) - gf_string2boolean (temp_str, &conf->search_unhashed); else - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; - } + gf_log (this->name, GF_LOG_WARNING, + "could not find hostname (%s)", + strerror (errno)); - ret = dht_init_subvolumes (this, conf); - if (ret == -1) { - goto err; } - ret = dht_layouts_init (this, conf); - if (ret == -1) { - goto err; + args.this = this; + args.volname = local_volname; + args.addr_match = addr_match; + ret = nufa_find_local_subvol (this, nufa_find_local_brick, &args); + if (ret) { + gf_log (this->name, GF_LOG_INFO, + "Unable to find local subvolume, switching " + "to dht mode"); + nufa_to_dht (this); } - - LOCK_INIT (&conf->subvolume_lock); - LOCK_INIT (&conf->layout_lock); - - conf->gen = 1; - - local_volname = "localhost"; - ret = gethostname (my_hostname, 256); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "could not find hostname (%s)", - strerror (errno)); - } - - if (ret == 0) - local_volname = my_hostname; - - data = dict_get (this->options, "local-volume-name"); - if (data) { - local_volname = data->data; - } - - trav = this->children; - while (trav) { - if (strcmp (trav->xlator->name, local_volname) == 0) - break; - trav = trav->next; - } - - if (!trav) { - gf_log (this->name, GF_LOG_ERROR, - "Could not find subvolume named '%s'. 
" - "Please define volume with the name as the hostname " - "or override it with 'option local-volume-name'", - local_volname); - goto err; - } - /* The volume specified exists */ - conf->private = trav->xlator; - - conf->min_free_disk = 10; - conf->disk_unit = 'p'; - - if (dict_get_str (this->options, "min-free-disk", - &temp_str) == 0) { - if (gf_string2percent (temp_str, - &temp_free_disk) == 0) { - if (temp_free_disk > 100) { - gf_string2bytesize (temp_str, - &conf->min_free_disk); - conf->disk_unit = 'b'; - } else { - conf->min_free_disk = (uint64_t)temp_free_disk; - conf->disk_unit = 'p'; - } - } else { - gf_string2bytesize (temp_str, - &conf->min_free_disk); - conf->disk_unit = 'b'; - } - } - - conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t), - gf_dht_mt_dht_du_t); - if (!conf->du_stats) { - goto err; - } - - this->private = conf; - return 0; +} -err: - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - if (conf->default_dir_layout) - GF_FREE (conf->default_dir_layout); - - if (conf->subvolumes) - GF_FREE (conf->subvolumes); - - if (conf->subvolume_status) - GF_FREE (conf->subvolume_status); - - if (conf->du_stats) - GF_FREE (conf->du_stats); - - GF_FREE (conf); - } - return -1; -} +class_methods_t class_methods = { + .init = nufa_init, + .fini = dht_fini, + .reconfigure = dht_reconfigure, + .notify = dht_notify +}; struct xlator_fops fops = { @@ -728,19 +674,3 @@ struct xlator_fops fops = { struct xlator_cbks cbks = { .forget = dht_forget }; - - -struct volume_options options[] = { - { .key = {"lookup-unhashed"}, - .value = {"auto", "yes", "no", "enable", "disable", "1", "0", - "on", "off"}, - .type = GF_OPTION_TYPE_STR - }, - { .key = {"local-volume-name"}, - .type = GF_OPTION_TYPE_XLATOR - }, - { .key = {"min-free-disk"}, - .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, - }, - { .key = {NULL} }, -}; diff --git 
a/xlators/cluster/dht/src/switch.c b/xlators/cluster/dht/src/switch.c index 20356e24e..d3ea90ba8 100644 --- a/xlators/cluster/dht/src/switch.c +++ b/xlators/cluster/dht/src/switch.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ @@ -23,7 +14,7 @@ #include "config.h" #endif -#include "dht-common.c" +#include "dht-common.h" #include "dht-mem-types.h" #include <sys/time.h> @@ -31,6 +22,8 @@ #include <fnmatch.h> #include <string.h> +extern struct volume_options options[]; + struct switch_sched_array { xlator_t *xl; int32_t eligible; @@ -76,29 +69,37 @@ get_switch_matching_subvol (const char *path, dht_conf_t *conf, struct switch_struct *cond = NULL; struct switch_struct *trav = NULL; char *pathname = NULL; - int idx = 0; + int idx = 0; + xlator_t *subvol = NULL; cond = conf->private; + subvol = hashed_subvol; if (!cond) - return hashed_subvol; + goto out; - trav = cond; pathname = gf_strdup (path); + if (!pathname) + goto out; + + trav = cond; while (trav) { if (fnmatch (trav->path_pattern, pathname, FNM_NOESCAPE) == 0) { for (idx = 0; idx < trav->num_child; idx++) { if (trav->array[idx].xl == hashed_subvol) - return hashed_subvol; + goto out; } idx = trav->node_index++; trav->node_index %= trav->num_child; - return trav->array[idx].xl; + subvol = trav->array[idx].xl; + goto out; } trav = trav->next; } +out: GF_FREE (pathname); - return hashed_subvol; + + return subvol; } @@ -136,15 +137,13 @@ switch_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret == -1) goto out; - is_linkfile = check_is_linkfile (inode, stbuf, xattr); + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); is_dir = check_is_dir (inode, stbuf, xattr); if (!is_dir && !is_linkfile) { /* non-directory and not a linkfile */ - dht_itransform (this, prev->this, stbuf->ia_ino, - &stbuf->ia_ino); - ret = dht_layout_preset (this, prev->this, inode); if (ret < 0) { gf_log (this->name, GF_LOG_DEBUG, @@ -249,21 +248,12 @@ switch_lookup (call_frame_t *frame, xlator_t *this, conf = this->private; - local = dht_local_init (frame); + local = dht_local_init (frame, loc, NULL, GF_FOP_LOOKUP); if (!local) { op_errno = ENOMEM; goto err; } - ret = loc_dup (loc, &local->loc); - if (ret 
== -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "copying location failed for path=%s", - loc->path); - goto err; - } - if (xattr_req) { local->xattr_req = dict_ref (xattr_req); } else { @@ -271,14 +261,12 @@ switch_lookup (call_frame_t *frame, xlator_t *this, } hashed_subvol = dht_subvol_get_hashed (this, &local->loc); - cached_subvol = dht_subvol_get_cached (this, local->loc.inode); + cached_subvol = local->cached_subvol; - local->cached_subvol = cached_subvol; local->hashed_subvol = hashed_subvol; if (is_revalidate (loc)) { - local->layout = layout = dht_layout_get (this, loc->inode); - + layout = local->layout; if (!layout) { gf_log (this->name, GF_LOG_DEBUG, "revalidate without cache. path=%s", @@ -296,7 +284,6 @@ switch_lookup (call_frame_t *frame, xlator_t *this, } local->inode = inode_ref (loc->inode); - local->ia_ino = loc->inode->ino; local->call_cnt = layout->cnt; call_cnt = local->call_cnt; @@ -305,11 +292,11 @@ switch_lookup (call_frame_t *frame, xlator_t *this, * attribute, revalidates directly go to the cached-subvolume. 
*/ ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + conf->xattr_name, 4 * 4); if (ret < 0) gf_log (this->name, GF_LOG_WARNING, - "failed to set dict value for " - "trusted.glusterfs.dht"); + "failed to set dict value for %s", + conf->xattr_name); for (i = 0; i < layout->cnt; i++) { subvol = layout->list[i].xlator; @@ -324,18 +311,18 @@ switch_lookup (call_frame_t *frame, xlator_t *this, } else { do_fresh_lookup: ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + conf->xattr_name, 4 * 4); if (ret < 0) gf_log (this->name, GF_LOG_WARNING, - "failed to set dict value for " - "trusted.glusterfs.dht"); + "failed to set dict value for %s", + conf->xattr_name); ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht.linkto", 256); + conf->link_xattr_name, 256); if (ret < 0) gf_log (this->name, GF_LOG_WARNING, - "failed to set dict value for " - "trusted.glusterfs.dht.linkto"); + "failed to set dict value for %s", + conf->link_xattr_name); if (!hashed_subvol) { gf_log (this->name, GF_LOG_DEBUG, @@ -381,7 +368,8 @@ switch_lookup (call_frame_t *frame, xlator_t *this, err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (lookup, frame, -1, op_errno, + NULL, NULL, NULL, NULL); return 0; } @@ -390,7 +378,7 @@ switch_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; @@ -401,28 +389,27 @@ switch_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, STACK_WIND (frame, dht_create_cbk, local->cached_subvol, local->cached_subvol->fops->create, - &local->loc, local->flags, local->mode, local->fd, - local->params); + &local->loc, local->flags, local->mode, local->umask, + local->fd, local->params); return 0; err: DHT_STACK_UNWIND (create, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL, NULL); return 0; } int switch_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params) + mode_t umask, fd_t *fd, dict_t *params) { dht_local_t *local = NULL; dht_conf_t *conf = NULL; xlator_t *subvol = NULL; xlator_t *avail_subvol = NULL; int op_errno = -1; - int ret = -1; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -432,7 +419,7 @@ switch_create (call_frame_t *frame, xlator_t *this, dht_get_du_info (frame, this, loc); - local = dht_local_init (frame); + local = dht_local_init (frame, loc, fd, GF_FOP_CREATE); if (!local) { op_errno = ENOMEM; goto err; @@ -450,25 +437,18 @@ switch_create (call_frame_t *frame, xlator_t *this, avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol); if (dht_is_subvol_filled (this, avail_subvol)) { avail_subvol = - dht_free_disk_available_subvol (this, avail_subvol); + dht_free_disk_available_subvol (this, avail_subvol, + local); } if (subvol != avail_subvol) { /* create a link file instead of actual file */ - ret = loc_copy 
(&local->loc, loc); - if (ret == -1) { - op_errno = ENOMEM; - goto err; - } - - local->fd = fd_ref (fd); local->mode = mode; local->flags = flags; - + local->umask = umask; local->cached_subvol = avail_subvol; - dht_linkfile_create (frame, - switch_create_linkfile_create_cbk, - avail_subvol, subvol, loc); + dht_linkfile_create (frame, switch_create_linkfile_create_cbk, + this, avail_subvol, subvol, loc); return 0; } @@ -477,14 +457,14 @@ switch_create (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, dht_create_cbk, subvol, subvol->fops->create, - loc, flags, mode, fd, params); + loc, flags, mode, umask, fd, params); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; DHT_STACK_UNWIND (create, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL, NULL); return 0; } @@ -493,38 +473,42 @@ int switch_mknod_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; local = frame->local; + if (!local || !local->cached_subvol) { + op_errno = EINVAL; + op_ret = -1; + goto err; + } if (op_ret >= 0) { - STACK_WIND (frame, dht_newfile_cbk, - local->cached_subvol, + STACK_WIND_COOKIE (frame, dht_newfile_cbk, + (void *)local->cached_subvol, local->cached_subvol, local->cached_subvol->fops->mknod, &local->loc, local->mode, local->rdev, - local->params); + local->umask, local->params); return 0; } - +err: DHT_STACK_UNWIND (link, frame, op_ret, op_errno, - inode, stbuf, preparent, postparent); + inode, stbuf, preparent, postparent, xdata); return 0; } int -switch_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t rdev, dict_t *params) +switch_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *params) { dht_local_t *local = NULL; dht_conf_t *conf = NULL; xlator_t 
*subvol = NULL; xlator_t *avail_subvol = NULL; int op_errno = -1; - int ret = -1; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -534,7 +518,7 @@ switch_mknod (call_frame_t *frame, xlator_t *this, dht_get_du_info (frame, this, loc); - local = dht_local_init (frame); + local = dht_local_init (frame, loc, NULL, GF_FOP_MKNOD); if (!local) { op_errno = ENOMEM; goto err; @@ -553,59 +537,45 @@ switch_mknod (call_frame_t *frame, xlator_t *this, avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol); if (dht_is_subvol_filled (this, avail_subvol)) { avail_subvol = - dht_free_disk_available_subvol (this, avail_subvol); + dht_free_disk_available_subvol (this, avail_subvol, + local); } if (avail_subvol != subvol) { /* Create linkfile first */ - ret = loc_copy (&local->loc, loc); - if (ret == -1) { - op_errno = ENOMEM; - goto err; - } local->params = dict_ref (params); local->mode = mode; + local->umask = umask; local->rdev = rdev; local->cached_subvol = avail_subvol; dht_linkfile_create (frame, switch_mknod_linkfile_cbk, - avail_subvol, subvol, loc); + this, avail_subvol, subvol, loc); return 0; } gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->mknod, - loc, mode, rdev, params); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, subvol, + subvol->fops->mknod, loc, mode, rdev, umask, + params); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; DHT_STACK_UNWIND (mknod, frame, -1, op_errno, - NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL); return 0; } -int -notify (xlator_t *this, int event, void *data, ...) 
-{ - int ret = -1; - - ret = dht_notify (this, event, data); - - return ret; -} - void -fini (xlator_t *this) +switch_fini (xlator_t *this) { - int i = 0; dht_conf_t *conf = NULL; struct switch_struct *trav = NULL; struct switch_struct *prev = NULL; @@ -616,33 +586,14 @@ fini (xlator_t *this) trav = (struct switch_struct *)conf->private; conf->private = NULL; while (trav) { - if (trav->array) - GF_FREE (trav->array); + GF_FREE (trav->array); prev = trav; trav = trav->next; GF_FREE (prev); } - - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - if (conf->default_dir_layout) - GF_FREE (conf->default_dir_layout); - - if (conf->subvolumes) - GF_FREE (conf->subvolumes); - - if (conf->subvolume_status) - GF_FREE (conf->subvolume_status); - - GF_FREE (conf); } - return; + dht_fini(this); } int @@ -702,17 +653,20 @@ set_switch_pattern (xlator_t *this, dht_conf_t *conf, dup_str = gf_strdup (switch_str); switch_opt = GF_CALLOC (1, sizeof (struct switch_struct), gf_switch_mt_switch_struct); - if (!switch_opt) + if (!switch_opt) { + GF_FREE (dup_str); goto err; + } pattern = strtok_r (dup_str, ":", &tmp_str1); childs = strtok_r (NULL, ":", &tmp_str1); if (strncmp (pattern, "*", 2) == 0) { - gf_log ("switch", GF_LOG_NORMAL, + gf_log ("switch", GF_LOG_INFO, "'*' pattern will be taken by default " "for all the unconfigured child nodes," " hence neglecting current option"); switch_str = strtok_r (NULL, ";", &tmp_str); + GF_FREE (switch_opt); GF_FREE (dup_str); continue; } @@ -785,6 +739,7 @@ set_switch_pattern (xlator_t *this, dht_conf_t *conf, /* First entry */ switch_buf = switch_opt; } + switch_opt = NULL; switch_str = strtok_r (NULL, ";", &tmp_str); } @@ -841,19 +796,20 @@ set_switch_pattern (xlator_t *this, dht_conf_t *conf, /* First entry */ switch_buf = switch_opt; } + switch_opt = NULL; } /* */ conf->private = switch_buf; return 0; err: + GF_FREE (switch_buf_array); + 
GF_FREE (switch_opt); + if (switch_buf) { - if (switch_buf_array) - GF_FREE (switch_buf_array); trav = switch_buf; while (trav) { - if (trav->array) - GF_FREE (trav->array); + GF_FREE (trav->array); switch_opt = trav; trav = trav->next; GF_FREE (switch_opt); @@ -863,68 +819,18 @@ err: } -int -init (xlator_t *this) +int32_t +switch_init (xlator_t *this) { dht_conf_t *conf = NULL; data_t *data = NULL; - char *temp_str = NULL; int ret = -1; - int i = 0; - uint32_t temp_free_disk = 0; - - if (!this->children) { - gf_log (this->name, GF_LOG_CRITICAL, - "SWITCH needs more than one subvolume"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile"); - } - - conf = GF_CALLOC (1, sizeof (*conf), gf_switch_mt_dht_conf_t); - if (!conf) { - goto err; - } - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON; - if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) { - /* If option is not "auto", other options _should_ be boolean */ - if (strcasecmp (temp_str, "auto")) - gf_string2boolean (temp_str, &conf->search_unhashed); - else - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; - } - - conf->unhashed_sticky_bit = 0; - if (dict_get_str (this->options, "unhashed-sticky-bit", - &temp_str) == 0) { - gf_string2boolean (temp_str, &conf->unhashed_sticky_bit); - } - - conf->min_free_disk = 10; - conf->disk_unit = 'p'; - - if (dict_get_str (this->options, "min-free-disk", - &temp_str) == 0) { - if (gf_string2percent (temp_str, - &temp_free_disk) == 0) { - if (temp_free_disk > 100) { - gf_string2bytesize (temp_str, - &conf->min_free_disk); - conf->disk_unit = 'b'; - } else { - conf->min_free_disk = (uint64_t)temp_free_disk; - conf->disk_unit = 'p'; - } - } else { - gf_string2bytesize (temp_str, - &conf->min_free_disk); - conf->disk_unit = 'b'; - } + ret = dht_init(this); + if (ret) { + return ret; } + conf = this->private; data = dict_get (this->options, "pattern.switch.case"); if (data) { @@ -935,59 +841,23 
@@ init (xlator_t *this) } } - ret = dht_init_subvolumes (this, conf); - if (ret == -1) { - goto err; - } - - ret = dht_layouts_init (this, conf); - if (ret == -1) { - goto err; - } - - LOCK_INIT (&conf->subvolume_lock); - LOCK_INIT (&conf->layout_lock); - - conf->gen = 1; - - conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t), - gf_switch_mt_dht_du_t); - if (!conf->du_stats) { - goto err; - } - this->private = conf; - return 0; err: - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - if (conf->default_dir_layout) - GF_FREE (conf->default_dir_layout); - - if (conf->subvolumes) - GF_FREE (conf->subvolumes); - - if (conf->subvolume_status) - GF_FREE (conf->subvolume_status); - - if (conf->du_stats) - GF_FREE (conf->du_stats); - - GF_FREE (conf); - } - + dht_fini(this); return -1; } +class_methods_t class_methods = { + .init = switch_init, + .fini = switch_fini, + .reconfigure = dht_reconfigure, + .notify = dht_notify +}; + + struct xlator_fops fops = { .lookup = switch_lookup, .create = switch_create, @@ -1032,19 +902,3 @@ struct xlator_fops fops = { struct xlator_cbks cbks = { .forget = dht_forget }; - - -struct volume_options options[] = { - { .key = {"lookup-unhashed"}, - .value = {"auto", "yes", "no", "enable", "disable", "1", "0", - "on", "off"}, - .type = GF_OPTION_TYPE_STR - }, - { .key = {"pattern.switch.case"}, - .type = GF_OPTION_TYPE_ANY - }, - { .key = {"min-free-disk"}, - .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, - }, - { .key = {NULL} }, -}; |
