diff options
Diffstat (limited to 'xlators/cluster')
112 files changed, 52975 insertions, 25948 deletions
diff --git a/xlators/cluster/Makefile.am b/xlators/cluster/Makefile.am index a6ddb3564..6e883e565 100644 --- a/xlators/cluster/Makefile.am +++ b/xlators/cluster/Makefile.am @@ -1,3 +1,3 @@ -SUBDIRS = unify stripe afr dht ha map +SUBDIRS = stripe afr dht nsr-server nsr-recon nsr-client CLEANFILES = diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am index 1bde9e5ba..ea5a90abb 100644 --- a/xlators/cluster/afr/src/Makefile.am +++ b/xlators/cluster/afr/src/Makefile.am @@ -1,20 +1,38 @@ -xlator_LTLIBRARIES = afr.la +xlator_LTLIBRARIES = afr.la pump.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster -afr_la_LDFLAGS = -module -avoidversion +afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c \ + afr-inode-write.c afr-open.c afr-transaction.c afr-lk-common.c \ + afr-read-txn.c \ + $(top_builddir)/xlators/lib/src/libxlator.c -afr_la_SOURCES = afr.c afr-dir-read.c afr-dir-write.c afr-inode-read.c afr-inode-write.c afr-transaction.c afr-self-heal-data.c afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c +AFR_SELFHEAL_SOURCES = afr-self-heal-common.c afr-self-heal-data.c \ + afr-self-heal-entry.c afr-self-heal-metadata.c afr-self-heald.c \ + afr-self-heal-name.c + +afr_la_LDFLAGS = -module -avoid-version +afr_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) afr.c afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h +pump_la_LDFLAGS = -module -avoid-version +pump_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) pump.c +pump_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h \ + afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-mem-types.h \ + afr-common.c afr-self-heald.h pump.h \ + $(top_builddir)/xlators/lib/src/libxlator.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) \ + -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \ + -I$(top_srcdir)/rpc/rpc-lib/src -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) +AM_CFLAGS = -Wall $(GF_CFLAGS) -CLEANFILES = +CLEANFILES = uninstall-local: rm -f $(DESTDIR)$(xlatordir)/replicate.so install-data-hook: - ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so
\ No newline at end of file + ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c new file mode 100644 index 000000000..164a651ba --- /dev/null +++ b/xlators/cluster/afr/src/afr-common.c @@ -0,0 +1,3629 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" +#include "statedump.h" +#include "inode.h" + +#include "fd.h" + +#include "afr-inode-read.h" +#include "afr-inode-write.h" +#include "afr-dir-read.h" +#include "afr-dir-write.h" +#include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-self-heald.h" + + +call_frame_t * +afr_copy_frame (call_frame_t *base) +{ + afr_local_t *local = NULL; + call_frame_t *frame = NULL; + int op_errno = 0; + + frame = copy_frame (base); + if (!frame) + return NULL; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) { + AFR_STACK_DESTROY (frame); + return NULL; + } + + return frame; +} + +/* + * INODE CTX 64-bit VALUE FORMAT FOR SMALL (<= 16) SUBVOL COUNTS: + * + * |<---------- 64bit ------------>| + * 63 32 31 16 15 0 + * | EVENT_GEN | DATA | METADATA | + * + * + * METADATA (bit-0 .. bit-15): bitmap representing subvolumes from which + * metadata can be attempted to be read. + * + * bit-0 => priv->subvolumes[0] + * bit-1 => priv->subvolumes[1] + * ... etc. till bit-15 + * + * DATA (bit-16 .. bit-31): bitmap representing subvolumes from which data + * can be attempted to be read. + * + * bit-16 => priv->subvolumes[0] + * bit-17 => priv->subvolumes[1] + * ... etc. till bit-31 + * + * EVENT_GEN (bit-32 .. bit-63): event generation (i.e priv->event_generation) + * when DATA and METADATA was last updated. + * + * If EVENT_GEN is < priv->event_generation, + * or is 0, it means afr_inode_refresh() needs + * to be called to recalculate the bitmaps. + */ + +int +__afr_inode_read_subvol_get_small (inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int *event_p) +{ + afr_private_t *priv = NULL; + int ret = -1; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint32_t event = 0; + uint64_t val = 0; + int i = 0; + + priv = this->private; + + ret = __inode_ctx_get (inode, this, &val); + if (ret < 0) + return ret; + + metadatamap = (val & 0x000000000000ffff); + datamap = (val & 0x00000000ffff0000) >> 16; + event = (val & 0xffffffff00000000) >> 32; + + for (i = 0; i < priv->child_count; i++) { + if (metadata) + metadata[i] = (metadatamap >> i) & 1; + if (data) + data[i] = (datamap >> i) & 1; + } + + if (event_p) + *event_p = event; + return ret; +} + + +int +__afr_inode_read_subvol_set_small (inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int event) +{ + afr_private_t *priv = NULL; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint64_t val = 0; + int i = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (data[i]) + datamap |= (1 << i); + if (metadata[i]) + metadatamap |= (1 << i); + } + + val = ((uint64_t) metadatamap) | + (((uint64_t) datamap) << 16) | + (((uint64_t) event) << 32); + + return __inode_ctx_set (inode, this, &val); +} + + +int +__afr_inode_read_subvol_reset_small (inode_t *inode, xlator_t *this) +{ + int ret = -1; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint32_t event = 0; + uint64_t val = 0; + + ret = __inode_ctx_get (inode, this, &val); + (void) ret; + + metadatamap = (val & 0x000000000000ffff) >> 0; + datamap = (val & 0x00000000ffff0000) >> 16; + event = 0; + + val = ((uint64_t) metadatamap) | + (((uint64_t) datamap) << 16) | + (((uint64_t) event) << 32); + + return __inode_ctx_set (inode, this, &val); +} + + +int +__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int *event_p) +{ + afr_private_t *priv = NULL; + int ret = -1; + + priv = this->private; + + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_get_small (inode, this, data, + metadata, event_p); + else + /* TBD: allocate structure with array and read from it */ + ret = -1; + + return ret; +} + + +int +__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int event) +{ + afr_private_t *priv = NULL; + int ret = -1; + + priv = this->private; + + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_set_small (inode, this, data, + metadata, event); + else + ret = -1; + + return ret; +} + + +int +__afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this) +{ + afr_private_t *priv = NULL; + int ret = -1; + + priv = this->private; + + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_reset_small (inode, this); + else + ret = -1; + + return ret; +} + + +int +afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int *event_p) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_get (inode, this, data, + metadata, event_p); + } + UNLOCK(&inode->lock); + + return ret; +} + + +int +afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int event) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_set (inode, this, data, metadata, + event); + } + UNLOCK(&inode->lock); + + return ret; +} + + +int +afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_reset (inode, this); + } + UNLOCK(&inode->lock); + + return ret; +} + + +int +afr_accused_fill (xlator_t *this, dict_t *xdata, unsigned char *accused, + afr_transaction_type type) +{ + afr_private_t *priv = NULL; + int i = 0; + int idx = afr_index_for_transaction_type (type); + void *pending_raw = NULL; + int pending[3]; + int ret = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + ret = dict_get_ptr (xdata, priv->pending_key[i], + &pending_raw); + if (ret) /* no pending flags */ + continue; + memcpy (pending, pending_raw, sizeof(pending)); + + if (ntoh32 (pending[idx])) + accused[i] = 1; + } + + return 0; +} + + +int +afr_accuse_smallfiles (xlator_t *this, struct afr_reply *replies, + unsigned char *data_accused) +{ + int i = 0; + afr_private_t *priv = NULL; + uint64_t maxsize = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) + continue; + if (replies[i].poststat.ia_size > maxsize) + maxsize = replies[i].poststat.ia_size; + } + + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) + continue; + if (replies[i].poststat.ia_size < maxsize) + data_accused[i] = 1; + } + + return 0; +} + + +int +afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int event_generation = 0; + int i = 0; + unsigned char *data_accused = NULL; + unsigned char *metadata_accused = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + int ret = 0; + + local = frame->local; + priv = this->private; + replies = local->replies; + event_generation = local->event_generation; + + data_accused = alloca0 (priv->child_count); + data_readable = alloca0 (priv->child_count); + metadata_accused = alloca0 (priv->child_count); + metadata_readable = alloca0 (priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + data_readable[i] = 1; + metadata_readable[i] = 1; + } + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) { + data_readable[i] = 0; + metadata_readable[i] = 0; + continue; + } + + if (replies[i].op_ret == -1) { + data_readable[i] = 0; + metadata_readable[i] = 0; + continue; + } + + afr_accused_fill (this, replies[i].xdata, data_accused, + (inode->ia_type == IA_IFDIR) ? + AFR_ENTRY_TRANSACTION : AFR_DATA_TRANSACTION); + + afr_accused_fill (this, replies[i].xdata, + metadata_accused, AFR_METADATA_TRANSACTION); + + } + + if (inode->ia_type != IA_IFDIR) + afr_accuse_smallfiles (this, replies, data_accused); + + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) { + data_readable[i] = 0; + ret = 1; + } + if (metadata_accused[i]) { + metadata_readable[i] = 0; + ret = 1; + } + } + + afr_inode_read_subvol_set (inode, this, data_readable, + metadata_readable, event_generation); + return ret; +} + + + +int +afr_refresh_selfheal_done (int ret, call_frame_t *heal, void *opaque) +{ + if (heal) + STACK_DESTROY (heal->root); + return 0; +} + +int +afr_inode_refresh_err (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int err = 0; + + local = frame->local; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && !local->replies[i].op_ret) { + err = 0; + goto ret; + } + } + + err = afr_final_errno (local, priv); +ret: + return -err; +} + + +int +afr_refresh_selfheal_wrap (void *opaque) +{ + call_frame_t *frame = opaque; + afr_local_t *local = NULL; + xlator_t *this = NULL; + int err = 0; + + local = frame->local; + this = frame->this; + + afr_selfheal (frame->this, local->refreshinode->gfid); + + afr_selfheal_unlocked_discover (frame, local->refreshinode, + local->refreshinode->gfid, + local->replies); + + afr_replies_interpret (frame, this, local->refreshinode); + + err = afr_inode_refresh_err (frame, this); + + afr_replies_wipe (local, this->private); + + local->refreshfn (frame, this, err); + + return 0; +} + + +gf_boolean_t +afr_selfheal_enabled (xlator_t *this) +{ + afr_private_t *priv = NULL; + gf_boolean_t data = _gf_false; + + priv = this->private; + + gf_string2boolean (priv->data_self_heal, &data); + + return data || priv->metadata_self_heal || priv->entry_self_heal; +} + + + +int +afr_inode_refresh_done (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *heal = NULL; + afr_local_t *local = NULL; + int ret = 0; + int err = 0; + + local = frame->local; + + ret = afr_replies_interpret (frame, this, local->refreshinode); + + err = afr_inode_refresh_err (frame, this); + + afr_replies_wipe (local, this->private); + + if (ret && afr_selfheal_enabled (this)) { + heal = copy_frame (frame); + if (heal) + heal->root->pid = -1; + ret = synctask_new (this->ctx->env, afr_refresh_selfheal_wrap, + afr_refresh_selfheal_done, heal, frame); + if (ret) + goto refresh_done; + } else { + refresh_done: + local->refreshfn (frame, this, err); + } + + return 0; +} + + +int +afr_inode_refresh_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *par) +{ + afr_local_t *local = NULL; + int call_child = (long) cookie; + int call_count = 0; + + local = frame->local; + + local->replies[call_child].valid = 1; + local->replies[call_child].op_ret = op_ret; + local->replies[call_child].op_errno = op_errno; + if (op_ret != -1) { + local->replies[call_child].poststat = *buf; + local->replies[call_child].postparent = *par; + local->replies[call_child].xdata = dict_ref (xdata); + } + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_inode_refresh_done (frame, this); + + return 0; +} + + +int +afr_inode_refresh_subvol (call_frame_t *frame, xlator_t *this, int i, + inode_t *inode, dict_t *xdata) +{ + loc_t loc = {0, }; + afr_private_t *priv = NULL; + + priv = this->private; + + loc.inode = inode; + uuid_copy (loc.gfid, inode->gfid); + + STACK_WIND_COOKIE (frame, afr_inode_refresh_subvol_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->lookup, &loc, xdata); + return 0; +} + + +int +afr_inode_refresh_do (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + dict_t *xdata = NULL; + + priv = this->private; + local = frame->local; + + afr_replies_wipe (local, priv); + + xdata = dict_new (); + if (!xdata) { + afr_inode_refresh_done (frame, this); + return 0; + } + + if (afr_xattr_req_prepare (this, xdata) != 0) { + dict_unref (xdata); + afr_inode_refresh_done (frame, this); + return 0; + } + + local->call_count = AFR_COUNT (local->child_up, priv->child_count); + + call_count = local->call_count; + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; + + afr_inode_refresh_subvol (frame, this, i, local->refreshinode, + xdata); + + if (!--call_count) + break; + } + + dict_unref (xdata); + + return 0; +} + + +int +afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_inode_refresh_cbk_t refreshfn) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->refreshfn = refreshfn; + + if (local->refreshinode) { + inode_unref (local->refreshinode); + local->refreshinode = NULL; + } + + local->refreshinode = inode_ref (inode); + + afr_inode_refresh_do (frame, this); + + return 0; +} + + +int +afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req) +{ + int i = 0; + afr_private_t *priv = NULL; + int ret = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + ret = dict_set_uint64 (xattr_req, priv->pending_key[i], + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret < 0) + gf_log (this->name, GF_LOG_WARNING, + "Unable to set dict value for %s", + priv->pending_key[i]); + /* 3 = data+metadata+entry */ + } + ret = dict_set_uint64 (xattr_req, AFR_DIRTY, + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, "failed to set dirty " + "query flag"); + } + + return ret; +} + +int +afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this, + dict_t *xattr_req, loc_t *loc) +{ + int ret = -ENOMEM; + + local->xattr_req = dict_new (); + if (!local->xattr_req) + goto out; + if (xattr_req) + dict_copy (xattr_req, local->xattr_req); + + ret = afr_xattr_req_prepare (this, local->xattr_req); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to prepare xattr_req", loc->path); + } + + ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_INODELK_COUNT); + } + ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_ENTRYLK_COUNT); + } + + ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_PARENT_ENTRYLK); + } + + ret = 0; +out: + return ret; +} + + +int +afr_hash_child (inode_t *inode, int32_t child_count, int hashmode) +{ + uuid_t gfid_copy = {0,}; + pid_t pid; + + if (!hashmode) { + return -1; + } + + if (inode) { + uuid_copy (gfid_copy, inode->gfid); + } + + if (hashmode > 1) { + /* + * Why getpid? Because it's one of the cheapest calls + * available - faster than gethostname etc. - and returns a + * constant-length value that's sure to be shorter than a UUID. + * It's still very unlikely to be the same across clients, so + * it still provides good mixing. We're not trying for + * perfection here. All we need is a low probability that + * multiple clients won't converge on the same subvolume. + */ + pid = getpid(); + memcpy (gfid_copy, &pid, sizeof(pid)); + } + + return SuperFastHash((char *)gfid_copy, + sizeof(gfid_copy)) % child_count; +} + + +int +afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this, + unsigned char *readable) +{ + afr_private_t *priv = NULL; + int read_subvol = -1; + int i = 0; + + priv = this->private; + + /* first preference - explicitly specified or local subvolume */ + if (priv->read_child >= 0 && readable[priv->read_child]) + return priv->read_child; + + /* second preference - use hashed mode */ + read_subvol = afr_hash_child (inode, priv->child_count, + priv->hash_mode); + if (read_subvol >= 0 && readable[read_subvol]) + return read_subvol; + + for (i = 0; i < priv->child_count; i++) { + if (readable[i]) + return i; + } + + /* no readable subvolumes, either split brain or all subvols down */ + + return -1; +} + + +int +afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this, + unsigned char *readable, int *event_p, + int type) +{ + int ret = -1; + + if (type == AFR_METADATA_TRANSACTION) + ret = afr_inode_read_subvol_get (inode, this, 0, readable, + event_p); + else + ret = afr_inode_read_subvol_get (inode, this, readable, 0, + event_p); + return ret; +} + + +int +afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p, + int *event_p, afr_transaction_type type) +{ + afr_private_t *priv = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + unsigned char *readable = NULL; + unsigned char *intersection = NULL; + int subvol = -1; + int event = 0; + + priv = this->private; + + readable = alloca0 (priv->child_count); + data_readable = alloca0 (priv->child_count); + metadata_readable = alloca0 (priv->child_count); + intersection = alloca0 (priv->child_count); + + afr_inode_read_subvol_type_get (inode, this, readable, &event, type); + + afr_inode_read_subvol_get (inode, this, data_readable, metadata_readable, + &event); + + AFR_INTERSECT (intersection, data_readable, metadata_readable, + priv->child_count); + + if (AFR_COUNT (intersection, priv->child_count) > 0) + subvol = afr_read_subvol_select_by_policy (inode, this, + intersection); + else + subvol = afr_read_subvol_select_by_policy (inode, this, + readable); + if (subvol_p) + *subvol_p = subvol; + if (event_p) + *event_p = event; + return subvol; +} + + +void +afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) +{ + afr_private_t *priv = NULL; + int i = 0; + + priv = this->private; + + afr_matrix_cleanup (local->pending, priv->child_count); + + GF_FREE (local->internal_lock.locked_nodes); + + for (i = 0; local->internal_lock.inodelk[i].domain; i++) { + GF_FREE (local->internal_lock.inodelk[i].locked_nodes); + } + + GF_FREE (local->internal_lock.lower_locked_nodes); + + afr_entry_lockee_cleanup (&local->internal_lock); + + GF_FREE (local->transaction.pre_op); + GF_FREE (local->transaction.eager_lock); + GF_FREE (local->transaction.fop_subvols); + GF_FREE (local->transaction.failed_subvols); + + GF_FREE (local->transaction.basename); + GF_FREE (local->transaction.new_basename); + + loc_wipe (&local->transaction.parent_loc); + loc_wipe (&local->transaction.new_parent_loc); + +} + + +void +afr_replies_wipe (afr_local_t *local, afr_private_t *priv) +{ + int i; + + if (!local->replies) + return; + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].xdata) { + dict_unref (local->replies[i].xdata); + local->replies[i].xdata = NULL; + } + } + + memset (local->replies, 0, sizeof(*local->replies) * priv->child_count); +} + +void +afr_remove_eager_lock_stub (afr_local_t *local) +{ + LOCK (&local->fd->lock); + { + list_del_init (&local->transaction.eager_locked); + } + UNLOCK (&local->fd->lock); +} + +void +afr_local_cleanup (afr_local_t *local, xlator_t *this) +{ + afr_private_t * priv = NULL; + + if (!local) + return; + + syncbarrier_destroy (&local->barrier); + + if (local->transaction.eager_lock_on && + !list_empty (&local->transaction.eager_locked)) + afr_remove_eager_lock_stub (local); + + afr_local_transaction_cleanup (local, this); + + priv = this->private; + + loc_wipe (&local->loc); + loc_wipe (&local->newloc); + + if (local->fd) + fd_unref (local->fd); + + if (local->xattr_req) + dict_unref (local->xattr_req); + + if (local->dict) + dict_unref (local->dict); + + afr_replies_wipe (local, priv); + GF_FREE(local->replies); + + GF_FREE (local->child_up); + + GF_FREE (local->read_attempted); + + GF_FREE (local->readable); + + if (local->inode) + inode_unref (local->inode); + + if (local->parent) + inode_unref (local->parent); + + if (local->parent2) + inode_unref (local->parent2); + + if (local->refreshinode) + inode_unref (local->refreshinode); + + { /* getxattr */ + GF_FREE (local->cont.getxattr.name); + } + + { /* lk */ + GF_FREE (local->cont.lk.locked_nodes); + } + + { /* create */ + if (local->cont.create.fd) + fd_unref (local->cont.create.fd); + if (local->cont.create.params) + dict_unref (local->cont.create.params); + } + + { /* mknod */ + if (local->cont.mknod.params) + dict_unref (local->cont.mknod.params); + } + + { /* mkdir */ + if (local->cont.mkdir.params) + dict_unref (local->cont.mkdir.params); + } + + { /* symlink */ + if (local->cont.symlink.params) + dict_unref (local->cont.symlink.params); + } + + { /* writev */ + GF_FREE (local->cont.writev.vector); + if (local->cont.writev.iobref) + iobref_unref (local->cont.writev.iobref); + } + + { /* setxattr */ + if (local->cont.setxattr.dict) + dict_unref (local->cont.setxattr.dict); + } + + { /* fsetxattr */ + if (local->cont.fsetxattr.dict) + dict_unref (local->cont.fsetxattr.dict); + } + + { /* removexattr */ + GF_FREE (local->cont.removexattr.name); + } + { /* xattrop */ + if (local->cont.xattrop.xattr) + dict_unref (local->cont.xattrop.xattr); + } + { /* fxattrop */ + if (local->cont.fxattrop.xattr) + dict_unref (local->cont.fxattrop.xattr); + } + { /* symlink */ + GF_FREE (local->cont.symlink.linkpath); + } + + { /* opendir */ + GF_FREE (local->cont.opendir.checksum); + } + + { /* readdirp */ + if (local->cont.readdir.dict) + dict_unref (local->cont.readdir.dict); + } + + if (local->xdata_req) + dict_unref (local->xdata_req); + + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); +} + + +int +afr_frame_return (call_frame_t *frame) +{ + afr_local_t *local = NULL; + int call_count = 0; + + local = frame->local; + + LOCK (&frame->lock); + { + call_count = --local->call_count; + } + UNLOCK (&frame->lock); + + return call_count; +} + + +gf_boolean_t +afr_is_entry_possibly_under_txn (afr_local_t *local, xlator_t *this) +{ + int i = 0; + int tmp = 0; + afr_private_t *priv = NULL; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].xdata) + continue; + if (dict_get_int32 (local->replies[i].xdata, + GLUSTERFS_PARENT_ENTRYLK, + &tmp) == 0) + if (tmp) + return _gf_true; + } + + return _gf_false; +} + + +/* + * Quota size xattrs are not maintained by afr. There is a + * possibility that they differ even when both the directory changelog xattrs + * suggest everything is fine. So if there is at least one 'source' check among + * the sources which has the maximum quota size. Otherwise check among all the + * available ones for maximum quota size. This way if there is a source and + * stale copies it always votes for the 'source'. + * */ + +static void +afr_handle_quota_size (call_frame_t *frame, xlator_t *this) +{ + unsigned char *readable = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int i = 0; + uint64_t size = 0; + uint64_t max_size = 0; + int readable_cnt = 0; + + local = frame->local; + priv = this->private; + replies = local->replies; + + readable = alloca0 (priv->child_count); + + afr_inode_read_subvol_get (local->inode, this, readable, 0, 0); + + readable_cnt = AFR_COUNT (readable, priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (readable_cnt && !readable[i]) + continue; + if (!replies[i].xdata) + continue; + if (dict_get_uint64 (replies[i].xdata, QUOTA_SIZE_KEY, &size)) + continue; + if (size > max_size) + max_size = size; + } + + if (!max_size) + return; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (readable_cnt && !readable[i]) + continue; + if (!replies[i].xdata) + continue; + if (dict_set_uint64 (replies[i].xdata, QUOTA_SIZE_KEY, max_size)) + continue; + } +} + + +static void +afr_lookup_done (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = -1; + int op_errno = 0; + int read_subvol = 0; + unsigned char *readable = NULL; + int event = 0; + struct afr_reply *replies = NULL; + uuid_t read_gfid = {0, }; + gf_boolean_t locked_entry = _gf_false; + gf_boolean_t can_interpret = _gf_true; + + priv = this->private; + local = frame->local; + replies = local->replies; + + locked_entry = afr_is_entry_possibly_under_txn (local, this); + + readable = alloca0 (priv->child_count); + + afr_inode_read_subvol_get (local->loc.parent, this, readable, + NULL, &event); + + /* First, check if we have an ESTALE from somewhere, + If so, propagate that so that a revalidate can be + issued + */ + op_errno = afr_final_errno (frame->local, this->private); + local->op_errno = op_errno; + if (op_errno == ESTALE) { + local->op_errno = op_errno; + local->op_ret = -1; + goto unwind; + } + + read_subvol = -1; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (locked_entry && replies[i].op_ret == -1 && + replies[i].op_errno == ENOENT) { + /* Second, check entry is still + "underway" in creation */ + local->op_ret = -1; + local->op_errno = ENOENT; + read_subvol = i; + goto unwind; + } + + if (replies[i].op_ret == -1) + continue; + + if (read_subvol == -1 || !readable[read_subvol]) { + read_subvol = i; + uuid_copy (read_gfid, replies[i].poststat.ia_gfid); + local->op_ret = 0; + } + } + + if (read_subvol == -1) + goto unwind; + /* We now have a read_subvol, which is readable[] (if there + were any). Next we look for GFID mismatches. We don't + consider a GFID mismatch as an error if read_subvol is + readable[] but the mismatching GFID subvol is not. + */ + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) { + if (priv->child_up[i]) + can_interpret = _gf_false; + continue; + } + + if (!uuid_compare (replies[i].poststat.ia_gfid, + read_gfid)) + continue; + + can_interpret = _gf_false; + + if (locked_entry) + continue; + + /* Now GFIDs mismatch. It's OK as long as this subvol + is not readable[] but read_subvol is */ + if (readable[read_subvol] && !readable[i]) + continue; + + /* LOG ERROR */ + local->op_ret = -1; + local->op_errno = EIO; + goto unwind; + } + + /* Forth, for the finalized GFID, pick the best subvolume + to return stats from. + */ + if (can_interpret) { + /* It is safe to call afr_replies_interpret() because we have + a response from all the UP subvolumes and all of them resolved + to the same GFID + */ + if (afr_replies_interpret (frame, this, local->inode)) { + read_subvol = afr_data_subvol_get (local->inode, this, + 0, 0); + afr_inode_read_subvol_reset (local->inode, this); + goto cant_interpret; + } else { + read_subvol = afr_data_subvol_get (local->inode, this, + 0, 0); + } + } else { + cant_interpret: + if (read_subvol == -1) + dict_del (replies[0].xdata, GF_CONTENT_KEY); + else + dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY); + } + + afr_handle_quota_size (frame, this); + +unwind: + if (read_subvol == -1) + read_subvol = 0; + + AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->replies[read_subvol].poststat, + local->replies[read_subvol].xdata, + &local->replies[read_subvol].postparent); +} + +/* + * During a lookup, some errors are more "important" than + * others in that they must be given higher priority while + * returning to the user. + * + * The hierarchy is ESTALE > ENOENT > others + */ + +int +afr_higher_errno (int32_t old_errno, int32_t new_errno) +{ + if (old_errno == ENODATA || new_errno == ENODATA) + return ENODATA; + if (old_errno == ESTALE || new_errno == ESTALE) + return ESTALE; + if (old_errno == ENOENT || new_errno == ENOENT) + return ENOENT; + + return new_errno; +} + + +int +afr_final_errno (afr_local_t *local, afr_private_t *priv) +{ + int i = 0; + int op_errno = 0; + int tmp_errno = 0; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret == 0) + continue; + tmp_errno = local->replies[i].op_errno; + op_errno = afr_higher_errno (op_errno, tmp_errno); + } + + return op_errno; +} + +static int +get_pathinfo_host (char *pathinfo, char *hostname, size_t size) +{ + char *start = NULL; + char *end = NULL; + int ret = -1; + int i = 0; + + if (!pathinfo) + goto out; + + start = strchr (pathinfo, ':'); + if (!start) + goto out; + end = strrchr (pathinfo, ':'); + if (start == end) + goto out; + + memset (hostname, 0, size); + i = 0; + while (++start != end) + hostname[i++] = *start; + ret = 0; +out: + return ret; +} + +int +afr_local_pathinfo (char *pathinfo, gf_boolean_t *local) +{ + int ret = 0; + char pathinfohost[1024] = {0}; + char localhost[1024] = {0}; + xlator_t *this = THIS; + + *local = _gf_false; + ret = get_pathinfo_host (pathinfo, pathinfohost, sizeof (pathinfohost)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Invalid pathinfo: %s", + pathinfo); + goto out; + } + + ret = gethostname (localhost, sizeof (localhost)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "gethostname() failed, " + "reason: %s", strerror (errno)); + goto out; + } + + if (!strcmp (localhost, pathinfohost)) + *local = _gf_true; +out: + return ret; +} + +static int32_t +afr_local_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + int ret = 0; + char *pathinfo = NULL; + gf_boolean_t is_local = _gf_false; + afr_private_t *priv = NULL; + int32_t child_index = -1; + + if (op_ret != 0) { + goto out; + } + + priv = this->private; + child_index = (int32_t)(long)cookie; + + ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo); + if (ret != 0) { + goto out; + } + + ret = afr_local_pathinfo (pathinfo, &is_local); + if (ret) { + goto out; + } + + /* + * Note that one local subvolume will override another here. The only + * way to avoid that would be to retain extra information about whether + * the previous read_child is local, and it's just not worth it. Even + * the slowest local subvolume is far preferable to a remote one. + */ + if (is_local) { + gf_log (this->name, GF_LOG_INFO, + "selecting local read_child %s", + priv->children[child_index]->name); + priv->read_child = child_index; + } +out: + STACK_DESTROY(frame->root); + return 0; +} + +static void +afr_attempt_local_discovery (xlator_t *this, int32_t child_index) +{ + call_frame_t *newframe = NULL; + loc_t tmploc = {0,}; + afr_private_t *priv = this->private; + + newframe = create_frame(this,this->ctx->pool); + if (!newframe) { + return; + } + + tmploc.gfid[sizeof(tmploc.gfid)-1] = 1; + STACK_WIND_COOKIE (newframe, afr_local_discovery_cbk, + (void *)(long)child_index, + priv->children[child_index], + priv->children[child_index]->fops->getxattr, + &tmploc, GF_XATTR_PATHINFO_KEY, NULL); +} + + +int +afr_lookup_selfheal_wrap (void *opaque) +{ + call_frame_t *frame = opaque; + afr_local_t *local = NULL; + xlator_t *this = NULL; + inode_t *inode = NULL; + + local = frame->local; + this = frame->this; + + afr_selfheal_name (frame->this, local->loc.pargfid, local->loc.name); + + afr_replies_wipe (local, this->private); + + inode = afr_selfheal_unlocked_lookup_on (frame, local->loc.parent, + local->loc.name, local->replies, + local->child_up); + if (inode) + inode_unref (inode); + afr_lookup_done (frame, this); + + return 0; +} + + +int +afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + call_frame_t *heal = NULL; + int i = 0, first = -1; + gf_boolean_t need_heal = _gf_false; + struct afr_reply *replies = NULL; + int ret = 0; + + local = frame->local; + replies = local->replies; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (first == -1) { + first = i; + continue; + } + + if (replies[i].op_ret != replies[first].op_ret) { + need_heal = _gf_true; + break; + } + + if (uuid_compare (replies[i].poststat.ia_gfid, + replies[first].poststat.ia_gfid)) { + need_heal = _gf_true; + break; + } + } + + if (need_heal) { + heal = copy_frame (frame); + if (heal) + heal->root->pid = -1; + ret = synctask_new (this->ctx->env, afr_lookup_selfheal_wrap, + afr_refresh_selfheal_done, heal, frame); + if (ret) + goto lookup_done; + } else { + lookup_done: + afr_lookup_done (frame, this); + } + + return ret; +} + + +int +afr_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *buf, + dict_t *xdata, struct iatt *postparent) +{ + afr_local_t * local = NULL; + int call_count = -1; + int child_index = -1; + + child_index = (long) cookie; + + local = frame->local; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (op_ret != -1) { + local->replies[child_index].poststat = *buf; + local->replies[child_index].postparent = *postparent; + if (xdata) + local->replies[child_index].xdata = dict_ref (xdata); + } + + call_count = afr_frame_return (frame); + if (call_count == 0) { + afr_lookup_entry_heal (frame, this); + } + + return 0; +} + + + +static void +afr_discover_done (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = -1; + int op_errno = 0; + int read_subvol = 0; + + priv = this->private; + local = frame->local; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret == 0) + local->op_ret = 0; + } + + op_errno = afr_final_errno (frame->local, this->private); + + if (local->op_ret < 0) { + local->op_errno = op_errno; + local->op_ret = -1; + goto unwind; + } + + afr_replies_interpret (frame, this, local->inode); + + read_subvol = afr_data_subvol_get (local->inode, this, 0, 0); + if (read_subvol == -1) { + gf_log (this->name, GF_LOG_WARNING, "no read subvols for %s", + local->loc.path); + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid || + local->replies[i].op_ret == -1) + continue; + read_subvol = i; + break; + } + } + +unwind: + if (read_subvol == -1) + read_subvol = 0; + + AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->replies[read_subvol].poststat, + local->replies[read_subvol].xdata, + &local->replies[read_subvol].postparent); +} + + +int +afr_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *buf, + dict_t *xdata, struct iatt *postparent) +{ + afr_local_t * local = NULL; + int call_count = -1; + int child_index = -1; + + child_index = (long) cookie; + + local = frame->local; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (op_ret != -1) { + local->replies[child_index].poststat = *buf; + local->replies[child_index].postparent = *postparent; + if (xdata) + local->replies[child_index].xdata = dict_ref (xdata); + } + + if (local->do_discovery && (op_ret == 0)) + afr_attempt_local_discovery (this, child_index); + + call_count = afr_frame_return (frame); + if (call_count == 0) { + afr_discover_done (frame, this); + } + + return 0; +} + + +int +afr_discover_do (call_frame_t *frame, xlator_t *this, int err) +{ + int ret = 0; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + local = frame->local; + priv = this->private; + + if (err) { + local->op_errno = -err; + ret = -1; + goto out; + } + + call_count = local->call_count = AFR_COUNT (local->child_up, + priv->child_count); + + ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req, + &local->loc); + if (ret) { + local->op_errno = -ret; + ret = -1; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_discover_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, local->xattr_req); + if (!--call_count) + break; + } + } + + return 0; +out: + AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0); + return 0; +} + + +int +afr_discover (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) +{ + int op_errno = ENOMEM; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int event = 0; + + priv = this->private; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + if (!local->call_count) { + op_errno = ENOTCONN; + goto out; + } + + if (__is_root_gfid (loc->inode->gfid)) { + if (!this->itable) + this->itable = loc->inode->table; + if (!priv->root_inode) + priv->root_inode = inode_ref (loc->inode); + + if (priv->choose_local && !priv->did_discovery) { + /* Logic to detect which subvolumes of AFR are + local, in order to prefer them for reads + */ + local->do_discovery = _gf_true; + priv->did_discovery = _gf_true; + } + } + + local->op = GF_FOP_LOOKUP; + + loc_copy (&local->loc, loc); + + local->inode = inode_ref (loc->inode); + + if (xattr_req) + /* If xattr_req was null, afr_lookup_xattr_req_prepare() will + allocate one for us */ + local->xattr_req = dict_ref (xattr_req); + + if (uuid_is_null (loc->inode->gfid)) { + afr_discover_do (frame, this, 0); + return 0; + } + + afr_read_subvol_get (loc->inode, this, NULL, &event, + AFR_DATA_TRANSACTION); + + if (event != local->event_generation) + afr_inode_refresh (frame, this, loc->inode, afr_discover_do); + else + afr_discover_do (frame, this, 0); + + return 0; +out: + AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + return 0; +} + + +int +afr_lookup_do (call_frame_t *frame, xlator_t *this, int err) +{ + int ret = 0; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + local = frame->local; + priv = this->private; + + if (err < 0) { + local->op_errno = -err; + ret = -1; + goto out; + } + + call_count = local->call_count = AFR_COUNT (local->child_up, + priv->child_count); + + ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req, + &local->loc); + if (ret) { + local->op_errno = -ret; + ret = -1; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_lookup_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, local->xattr_req); + if (!--call_count) + break; + } + } + return 0; +out: + AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0); + return 0; +} + +/* + * afr_lookup() + * + * The goal here is to figure out what the element getting looked up is. + * i.e what is the GFID, inode type and a conservative estimate of the + * inode attributes are. + * + * As we lookup, operations may be underway on the entry name and the + * inode. In lookup() we are primarily concerned only with the entry + * operations. If the entry is getting unlinked or renamed, we detect + * what operation is underway by querying for on-going transactions and + * pending self-healing on the entry through xdata. + * + * If the entry is a file/dir, it may need self-heal and/or in a + * split-brain condition. Lookup is not the place to worry about these + * conditions. Outcast marking will naturally handle them in the read + * paths. + * + * Here is a brief goal of what we are trying to achieve: + * + * - LOOKUP on all subvolumes concurrently, querying on-going transaction + * and pending self-heal info from the servers. + * + * - If all servers reply the same inode type and GFID, the overall call + * MUST be a success. + * + * - If inode types or GFIDs mismatch, and there IS either an on-going + * transaction or pending self-heal, inspect what the nature of the + * transaction or pending heal is, and select the appropriate subvolume's + * reply as the winner. + * + * - If inode types or GFIDs mismatch, and there are no on-going transactions + * or pending self-heal on the entry name on any of the servers, fail the + * lookup with EIO. Something has gone wrong beyond reasonable action. + */ + +int +afr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) +{ + afr_local_t *local = NULL; + int32_t op_errno = 0; + int event = 0; + + if (!loc->parent) { + afr_discover (frame, this, loc, xattr_req); + return 0; + } + + if (__is_root_gfid (loc->parent->gfid)) { + if (!strcmp (loc->name, GF_REPLICATE_TRASH_DIR)) { + op_errno = EPERM; + goto out; + } + } + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + if (!local->call_count) { + op_errno = ENOTCONN; + goto out; + } + + local->op = GF_FOP_LOOKUP; + + loc_copy (&local->loc, loc); + + local->inode = inode_ref (loc->inode); + + if (xattr_req) + /* If xattr_req was null, afr_lookup_xattr_req_prepare() will + allocate one for us */ + local->xattr_req = dict_ref (xattr_req); + + afr_read_subvol_get (loc->parent, this, NULL, &event, + AFR_DATA_TRANSACTION); + + if (event != local->event_generation) + afr_inode_refresh (frame, this, loc->parent, afr_lookup_do); + else + afr_lookup_do (frame, this, 0); + + return 0; +out: + AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + + return 0; +} + + +/* {{{ open */ + +afr_fd_ctx_t * +__afr_fd_ctx_get (fd_t *fd, xlator_t *this) +{ + uint64_t ctx = 0; + int ret = 0; + afr_fd_ctx_t *fd_ctx = NULL; + + ret = __fd_ctx_get (fd, this, &ctx); + + if (ret < 0) { + ret = __afr_fd_ctx_set (this, fd); + if (ret < 0) + goto out; + + ret = __fd_ctx_get (fd, this, &ctx); + if (ret < 0) + goto out; + } + + fd_ctx = (afr_fd_ctx_t *)(long) ctx; +out: + return fd_ctx; +} + + +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this) +{ + afr_fd_ctx_t *fd_ctx = NULL; + + LOCK(&fd->lock); + { + fd_ctx = __afr_fd_ctx_get (fd, this); + } + UNLOCK(&fd->lock); + + return fd_ctx; +} + + +int +__afr_fd_ctx_set (xlator_t *this, fd_t *fd) +{ + afr_private_t * priv = NULL; + int ret = -1; + uint64_t ctx = 0; + afr_fd_ctx_t * fd_ctx = NULL; + int i = 0; + + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (fd, out); + + priv = this->private; + + ret = __fd_ctx_get (fd, this, &ctx); + + if (ret == 0) + goto out; + + fd_ctx = GF_CALLOC (1, sizeof (afr_fd_ctx_t), + gf_afr_mt_afr_fd_ctx_t); + if (!fd_ctx) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { + fd_ctx->pre_op_done[i] = GF_CALLOC (sizeof (*fd_ctx->pre_op_done[i]), + priv->child_count, + gf_afr_mt_int32_t); + if (!fd_ctx->pre_op_done[i]) { + ret = -ENOMEM; + goto out; + } + } + + fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), + priv->child_count, + gf_afr_mt_int32_t); + if (!fd_ctx->opened_on) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (fd_is_anonymous (fd)) + fd_ctx->opened_on[i] = AFR_FD_OPENED; + else + fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED; + } + + fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback), + priv->child_count, + gf_afr_mt_char); + if (!fd_ctx->lock_piggyback) { + ret = -ENOMEM; + goto out; + } + + fd_ctx->lock_acquired = GF_CALLOC (sizeof (*fd_ctx->lock_acquired), + priv->child_count, + gf_afr_mt_char); + if (!fd_ctx->lock_acquired) { + ret = -ENOMEM; + goto out; + } + + pthread_mutex_init (&fd_ctx->delay_lock, NULL); + + INIT_LIST_HEAD (&fd_ctx->eager_locked); + + ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); + if (ret) + gf_log (this->name, GF_LOG_DEBUG, + "failed to set fd ctx (%p)", fd); +out: + return ret; +} + + +int +afr_fd_ctx_set (xlator_t *this, fd_t *fd) +{ + int ret = -1; + + LOCK (&fd->lock); + { + ret = __afr_fd_ctx_set (this, fd); + } + UNLOCK (&fd->lock); + + return ret; +} + +/* {{{ flush */ + +int +afr_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + afr_local_t *local = NULL; + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret != -1) { + local->op_ret = op_ret; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + } else { + local->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (flush, frame, local->op_ret, + local->op_errno, local->xdata_rsp); + + return 0; +} + +static int +afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + + priv = this->private; + local = frame->local; + call_count = local->call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_flush_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->flush, + local->fd, xdata); + if (!--call_count) + break; + + } + } + + return 0; +} + +int +afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + afr_local_t *local = NULL; + call_stub_t *stub = NULL; + int op_errno = ENOMEM; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + if (!local->call_count) { + op_errno = ENOTCONN; + goto out; + } + + local->fd = fd_ref(fd); + + stub = fop_flush_stub (frame, afr_flush_wrapper, fd, xdata); + if (!stub) + goto out; + + afr_delayed_changelog_wake_resume (this, fd, stub); + + return 0; +out: + AFR_STACK_UNWIND (flush, frame, -1, op_errno, NULL); + return 0; +} + +/* }}} */ + + +int +afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) +{ + uint64_t ctx = 0; + afr_fd_ctx_t *fd_ctx = NULL; + int ret = 0; + int i = 0; + + ret = fd_ctx_get (fd, this, &ctx); + if (ret < 0) + goto out; + + fd_ctx = (afr_fd_ctx_t *)(long) ctx; + + if (fd_ctx) { + //no need to take any locks + if (!list_empty (&fd_ctx->eager_locked)) + gf_log (this->name, GF_LOG_WARNING, "%s: Stale " + "Eager-lock stubs found", + uuid_utoa (fd->inode->gfid)); + + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) + GF_FREE (fd_ctx->pre_op_done[i]); + + GF_FREE (fd_ctx->opened_on); + + GF_FREE (fd_ctx->lock_piggyback); + + GF_FREE (fd_ctx->lock_acquired); + + pthread_mutex_destroy (&fd_ctx->delay_lock); + + GF_FREE (fd_ctx); + } + +out: + return 0; +} + + +int +afr_release (xlator_t *this, fd_t *fd) +{ + afr_cleanup_fd_ctx (this, fd); + + return 0; +} + + +/* {{{ fsync */ + +int +afr_fsync_unwind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} + +int +afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t *local = NULL; + int call_count = -1; + int child_index = (long) cookie; + int read_subvol = 0; + call_stub_t *stub = NULL; + + local = frame->local; + + read_subvol = afr_data_subvol_get (local->inode, this, 0, 0); + + LOCK (&frame->lock); + { + if (op_ret == 0) { + if (local->op_ret == -1) { + local->op_ret = 0; + + local->cont.inode_wfop.prebuf = *prebuf; + local->cont.inode_wfop.postbuf = *postbuf; + + if (xdata) + local->xdata_rsp = dict_ref (xdata); + } + + if (child_index == read_subvol) { + local->cont.inode_wfop.prebuf = *prebuf; + local->cont.inode_wfop.postbuf = *postbuf; + if (xdata) { + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); + local->xdata_rsp = dict_ref (xdata); + } + } + } else { + local->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + /* Make a stub out of the frame, and register it + with the waking up post-op. When the call-stub resumes, + we are guaranteed that there was no post-op pending + (i.e changelogs were unset in the server). This is an + essential "guarantee", that fsync() returns only after + completely finishing EVERYTHING, including the delayed + post-op. This guarantee is expected by FUSE graph switching + for example. + */ + stub = fop_fsync_cbk_stub (frame, afr_fsync_unwind_cbk, + local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + local->xdata_rsp); + if (!stub) { + AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0); + return 0; + } + + /* If no new unstable writes happened between the + time we cleared the unstable write witness flag in afr_fsync + and now, calling afr_delayed_changelog_wake_up() should + wake up and skip over the fsync phase and go straight to + afr_changelog_post_op_now() + */ + afr_delayed_changelog_wake_resume (this, local->fd, stub); + } + + return 0; +} + + +int +afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int32_t call_count = 0; + int32_t op_errno = ENOMEM; + + priv = this->private; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } + + local->fd = fd_ref (fd); + + if (afr_fd_has_witnessed_unstable_write (this, fd)) { + /* don't care. we only wanted to CLEAR the bit */ + } + + local->inode = inode_ref (fd->inode); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_fsync_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fsync, + fd, datasync, xdata); + if (!--call_count) + break; + } + } + + return 0; +out: + AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + +/* }}} */ + +/* {{{ fsync */ + +int +afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + afr_local_t *local = NULL; + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) { + local->op_ret = 0; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + } else { + local->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (fsyncdir, frame, local->op_ret, + local->op_errno, local->xdata_rsp); + + return 0; +} + + +int +afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int32_t call_count = 0; + int32_t op_errno = ENOMEM; + + priv = this->private; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_fsyncdir_cbk, + priv->children[i], + priv->children[i]->fops->fsyncdir, + fd, datasync, xdata); + if (!--call_count) + break; + } + } + + return 0; +out: + AFR_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); + + return 0; +} + +/* }}} */ + +/* {{{ xattrop */ + +int32_t +afr_xattrop_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xattr, dict_t *xdata) +{ + afr_local_t *local = NULL; + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) { + if (!local->cont.xattrop.xattr) + local->cont.xattrop.xattr = dict_ref (xattr); + + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + + local->op_ret = 0; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (xattrop, frame, local->op_ret, local->op_errno, + local->cont.xattrop.xattr, local->xdata_rsp); + + return 0; +} + + +int32_t +afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int32_t call_count = 0; + int32_t op_errno = ENOMEM; + + priv = this->private; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_xattrop_cbk, + priv->children[i], + priv->children[i]->fops->xattrop, + loc, optype, xattr, xdata); + if (!--call_count) + break; + } + } + + return 0; +out: + AFR_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL); + + return 0; +} + +/* }}} */ + +/* {{{ fxattrop */ + +int32_t +afr_fxattrop_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xattr, dict_t *xdata) +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) { + if (!local->cont.fxattrop.xattr) + local->cont.fxattrop.xattr = dict_ref (xattr); + + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + local->op_ret = 0; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (fxattrop, frame, local->op_ret, local->op_errno, + local->cont.fxattrop.xattr, local->xdata_rsp); + + return 0; +} + + +int32_t +afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int32_t call_count = 0; + int32_t op_errno = 0; + + priv = this->private; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_fxattrop_cbk, + priv->children[i], + priv->children[i]->fops->fxattrop, + fd, optype, xattr, xdata); + if (!--call_count) + break; + } + } + + return 0; +out: + AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); + + return 0; +} + +/* }}} */ + + +int32_t +afr_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) + +{ + afr_local_t *local = NULL; + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (inodelk, frame, local->op_ret, + local->op_errno, xdata); + + return 0; +} + + +int32_t +afr_inodelk (call_frame_t *frame, xlator_t *this, + const char *volume, loc_t *loc, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int32_t call_count = 0; + int32_t op_errno = ENOMEM; + + priv = this->private; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + call_count = local->call_count; + if (!call_count) { + op_errno = ENOMEM; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_inodelk_cbk, + priv->children[i], + priv->children[i]->fops->inodelk, + volume, loc, cmd, flock, xdata); + + if (!--call_count) + break; + } + } + + return 0; +out: + AFR_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL); + + return 0; +} + + +int32_t +afr_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) + +{ + afr_local_t *local = NULL; + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (finodelk, frame, local->op_ret, + local->op_errno, xdata); + + return 0; +} + + +int32_t +afr_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int32_t cmd, struct gf_flock *flock, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int32_t call_count = 0; + int32_t op_errno = ENOMEM; + + priv = this->private; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_finodelk_cbk, + priv->children[i], + priv->children[i]->fops->finodelk, + volume, fd, cmd, flock, xdata); + + if (!--call_count) + break; + } + } + + return 0; +out: + AFR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL); + + return 0; +} + + +int32_t +afr_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + afr_local_t *local = NULL; + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (entrylk, frame, local->op_ret, + local->op_errno, xdata); + + return 0; +} + + +int +afr_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, + loc_t *loc, const char *basename, entrylk_cmd cmd, + entrylk_type type, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int32_t call_count = 0; + int32_t op_errno = 0; + + priv = this->private; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_entrylk_cbk, + priv->children[i], + priv->children[i]->fops->entrylk, + volume, loc, basename, cmd, type, xdata); + + if (!--call_count) + break; + } + } + + return 0; +out: + AFR_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL); + + return 0; +} + + + +int +afr_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) + +{ + afr_local_t *local = NULL; + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (fentrylk, frame, local->op_ret, + local->op_errno, xdata); + + return 0; +} + + +int +afr_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int32_t call_count = 0; + int32_t op_errno = ENOMEM; + + priv = this->private; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_fentrylk_cbk, + priv->children[i], + priv->children[i]->fops->fentrylk, + volume, fd, basename, cmd, type, xdata); + + if (!--call_count) + break; + } + } + + return 0; +out: + AFR_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL); + + return 0; +} + + +int +afr_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct statvfs *statvfs, dict_t *xdata) +{ + afr_local_t *local = NULL; + int call_count = 0; + struct statvfs *buf = NULL; + + LOCK (&frame->lock); + { + local = frame->local; + + if (op_ret != 0) { + local->op_errno = op_errno; + goto unlock; + } + + local->op_ret = op_ret; + + buf = &local->cont.statfs.buf; + if (local->cont.statfs.buf_set) { + if (statvfs->f_bavail < buf->f_bavail) { + *buf = *statvfs; + if (xdata) { + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); + local->xdata_rsp = dict_ref (xdata); + } + } + } else { + *buf = *statvfs; + local->cont.statfs.buf_set = 1; + if (xdata) + local->xdata_rsp = dict_ref (xdata); + } + } +unlock: + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno, + &local->cont.statfs.buf, local->xdata_rsp); + + return 0; +} + + +int +afr_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + afr_local_t * local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; + int32_t op_errno = ENOMEM; + + priv = this->private; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_statfs_cbk, + priv->children[i], + priv->children[i]->fops->statfs, + loc, xdata); + if (!--call_count) + break; + } + } + + return 0; +out: + AFR_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int32_t +afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) +{ + afr_local_t * local = NULL; + int call_count = -1; + + local = frame->local; + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, + lock, xdata); + + return 0; +} + + +int32_t +afr_lk_unlock (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + int i = 0; + int call_count = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_locked_nodes_count (local->cont.lk.locked_nodes, + priv->child_count); + + if (call_count == 0) { + AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, + &local->cont.lk.ret_flock, NULL); + return 0; + } + + local->call_count = call_count; + + local->cont.lk.user_flock.l_type = F_UNLCK; + + for (i = 0; i < priv->child_count; i++) { + if (local->cont.lk.locked_nodes[i]) { + STACK_WIND (frame, afr_lk_unlock_cbk, + priv->children[i], + priv->children[i]->fops->lk, + local->fd, F_SETLK, + &local->cont.lk.user_flock, NULL); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int32_t +afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int child_index = -1; +/* int ret = 0; */ + + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + if (!child_went_down (op_ret, op_errno) && (op_ret == -1)) { + local->op_ret = -1; + local->op_errno = op_errno; + + afr_lk_unlock (frame, this); + return 0; + } + + if (op_ret == 0) { + local->op_ret = 0; + local->op_errno = 0; + local->cont.lk.locked_nodes[child_index] = 1; + local->cont.lk.ret_flock = *lock; + } + + child_index++; + + if (child_index < priv->child_count) { + STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->lk, + local->fd, local->cont.lk.cmd, + &local->cont.lk.user_flock, xdata); + } else if (local->op_ret == -1) { + /* all nodes have gone down */ + + AFR_STACK_UNWIND (lk, frame, -1, ENOTCONN, + &local->cont.lk.ret_flock, NULL); + } else { + AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, + &local->cont.lk.ret_flock, NULL); + } + + return 0; +} + + +int +afr_lk (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int32_t op_errno = ENOMEM; + + priv = this->private; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + local->cont.lk.locked_nodes = GF_CALLOC (priv->child_count, + sizeof (*local->cont.lk.locked_nodes), + gf_afr_mt_char); + + if (!local->cont.lk.locked_nodes) { + op_errno = ENOMEM; + goto out; + } + + local->fd = fd_ref (fd); + local->cont.lk.cmd = cmd; + local->cont.lk.user_flock = *flock; + local->cont.lk.ret_flock = *flock; + + STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0, + priv->children[i], + priv->children[i]->fops->lk, + fd, cmd, flock, xdata); + + return 0; +out: + AFR_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int +afr_forget (xlator_t *this, inode_t *inode) +{ + return 0; +} + +int +afr_priv_dump (xlator_t *this) +{ + afr_private_t *priv = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN]; + char key[GF_DUMP_MAX_BUF_LEN]; + int i = 0; + + + GF_ASSERT (this); + priv = this->private; + + GF_ASSERT (priv); + snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); + gf_proc_dump_add_section(key_prefix); + gf_proc_dump_write("child_count", "%u", priv->child_count); + for (i = 0; i < priv->child_count; i++) { + sprintf (key, "child_up[%d]", i); + gf_proc_dump_write(key, "%d", priv->child_up[i]); + sprintf (key, "pending_key[%d]", i); + gf_proc_dump_write(key, "%s", priv->pending_key[i]); + } + gf_proc_dump_write("data_self_heal", "%s", priv->data_self_heal); + gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal); + gf_proc_dump_write("entry_self_heal", "%d", priv->entry_self_heal); + gf_proc_dump_write("data_change_log", "%d", priv->data_change_log); + gf_proc_dump_write("metadata_change_log", "%d", priv->metadata_change_log); + gf_proc_dump_write("entry-change_log", "%d", priv->entry_change_log); + gf_proc_dump_write("read_child", "%d", priv->read_child); + gf_proc_dump_write("favorite_child", "%d", priv->favorite_child); + gf_proc_dump_write("wait_count", "%u", priv->wait_count); + + return 0; +} + + +/** + * find_child_index - find the child's index in the array of subvolumes + * @this: AFR + * @child: child + */ + +static int +find_child_index (xlator_t *this, xlator_t *child) +{ + afr_private_t *priv = NULL; + int i = -1; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if ((xlator_t *) child == priv->children[i]) + break; + } + + return i; +} + +int32_t +afr_notify (xlator_t *this, int32_t event, + void *data, void *data2) +{ + afr_private_t *priv = NULL; + int i = -1; + int up_children = 0; + int down_children = 0; + int propagate = 0; + int had_heard_from_all = 0; + int have_heard_from_all = 0; + int idx = -1; + int ret = -1; + int call_psh = 0; + int up_child = -1; + dict_t *input = NULL; + dict_t *output = NULL; + + priv = this->private; + + if (!priv) + return 0; + + /* + * We need to reset this in case children come up in "staggered" + * fashion, so that we discover a late-arriving local subvolume. Note + * that we could end up issuing N lookups to the first subvolume, and + * O(N^2) overall, but N is small for AFR so it shouldn't be an issue. + */ + priv->did_discovery = _gf_false; + + had_heard_from_all = 1; + for (i = 0; i < priv->child_count; i++) { + if (!priv->last_event[i]) { + had_heard_from_all = 0; + } + } + + /* parent xlators dont need to know about every child_up, child_down + * because of afr ha. If all subvolumes go down, child_down has + * to be triggered. In that state when 1 subvolume comes up child_up + * needs to be triggered. dht optimizes revalidate lookup by sending + * it only to one of its subvolumes. When child up/down happens + * for afr's subvolumes dht should be notified by child_modified. The + * subsequent revalidate lookup happens on all the dht's subvolumes + * which triggers afr self-heals if any. + */ + idx = find_child_index (this, data); + if (idx < 0) { + gf_log (this->name, GF_LOG_ERROR, "Received child_up " + "from invalid subvolume"); + goto out; + } + + switch (event) { + case GF_EVENT_CHILD_UP: + LOCK (&priv->lock); + { + /* + * This only really counts if the child was never up + * (value = -1) or had been down (value = 0). See + * comment at GF_EVENT_CHILD_DOWN for a more detailed + * explanation. + */ + if (priv->child_up[idx] != 1) { + priv->up_count++; + priv->event_generation++; + } + priv->child_up[idx] = 1; + + call_psh = 1; + up_child = idx; + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == 1) + up_children++; + if (up_children == 1) { + gf_log (this->name, GF_LOG_INFO, + "Subvolume '%s' came back up; " + "going online.", ((xlator_t *)data)->name); + } else { + event = GF_EVENT_CHILD_MODIFIED; + } + + priv->last_event[idx] = event; + } + UNLOCK (&priv->lock); + + break; + + case GF_EVENT_CHILD_DOWN: + LOCK (&priv->lock); + { + /* + * If a brick is down when we start, we'll get a + * CHILD_DOWN to indicate its initial state. There + * was never a CHILD_UP in this case, so if we + * increment "down_count" the difference between than + * and "up_count" will no longer be the number of + * children that are currently up. This has serious + * implications e.g. for quorum enforcement, so we + * don't increment these values unless the event + * represents an actual state transition between "up" + * (value = 1) and anything else. + */ + if (priv->child_up[idx] == 1) { + priv->down_count++; + priv->event_generation++; + } + priv->child_up[idx] = 0; + + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == 0) + down_children++; + if (down_children == priv->child_count) { + gf_log (this->name, GF_LOG_ERROR, + "All subvolumes are down. Going offline " + "until atleast one of them comes back up."); + } else { + event = GF_EVENT_CHILD_MODIFIED; + } + + priv->last_event[idx] = event; + } + UNLOCK (&priv->lock); + + break; + + case GF_EVENT_CHILD_CONNECTING: + LOCK (&priv->lock); + { + priv->last_event[idx] = event; + } + UNLOCK (&priv->lock); + + break; + + case GF_EVENT_TRANSLATOR_OP: + input = data; + output = data2; + if (!had_heard_from_all) { + ret = -1; + goto out; + } + ret = afr_xl_op (this, input, output); + goto out; + break; + + default: + propagate = 1; + break; + } + + /* have all subvolumes reported status once by now? */ + have_heard_from_all = 1; + for (i = 0; i < priv->child_count; i++) { + if (!priv->last_event[i]) + have_heard_from_all = 0; + } + + /* if all subvols have reported status, no need to hide anything + or wait for anything else. Just propagate blindly */ + if (have_heard_from_all) + propagate = 1; + + if (!had_heard_from_all && have_heard_from_all) { + /* This is the first event which completes aggregation + of events from all subvolumes. If at least one subvol + had come up, propagate CHILD_UP, but only this time + */ + event = GF_EVENT_CHILD_DOWN; + + LOCK (&priv->lock); + { + up_children = AFR_COUNT (priv->child_up, priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (priv->last_event[i] == GF_EVENT_CHILD_UP) { + event = GF_EVENT_CHILD_UP; + break; + } + + if (priv->last_event[i] == + GF_EVENT_CHILD_CONNECTING) { + event = GF_EVENT_CHILD_CONNECTING; + /* continue to check other events for CHILD_UP */ + } + } + } + UNLOCK (&priv->lock); + } + + ret = 0; + if (propagate) + ret = default_notify (this, event, data); + + if (call_psh && priv->shd.iamshd) { + afr_selfheal_childup (this, up_child); + } +out: + return ret; +} + + +int +afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) +{ + local->op_ret = -1; + local->op_errno = EUCLEAN; + + syncbarrier_init (&local->barrier); + + local->child_up = GF_CALLOC (priv->child_count, + sizeof (*local->child_up), + gf_afr_mt_char); + if (!local->child_up) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + memcpy (local->child_up, priv->child_up, + sizeof (*local->child_up) * priv->child_count); + local->call_count = AFR_COUNT (local->child_up, priv->child_count); + if (local->call_count == 0) { + gf_log (THIS->name, GF_LOG_INFO, "no subvolumes up"); + if (op_errno) + *op_errno = ENOTCONN; + goto out; + } + local->event_generation = priv->event_generation; + + local->read_attempted = GF_CALLOC (priv->child_count, sizeof (char), + gf_afr_mt_char); + if (!local->read_attempted) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + local->readable = GF_CALLOC (priv->child_count, sizeof (char), + gf_afr_mt_char); + if (!local->readable) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies), + gf_afr_mt_reply_t); + if (!local->replies) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + return 0; +out: + return -1; +} + +int +afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, + transaction_lk_type_t lk_type) +{ + int ret = -ENOMEM; + + lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), + child_count, gf_afr_mt_char); + if (NULL == lk->locked_nodes) + goto out; + + lk->lower_locked_nodes = GF_CALLOC (sizeof (*lk->lower_locked_nodes), + child_count, gf_afr_mt_char); + if (NULL == lk->lower_locked_nodes) + goto out; + + lk->lock_op_ret = -1; + lk->lock_op_errno = EUCLEAN; + lk->transaction_lk_type = lk_type; + + ret = 0; +out: + return ret; +} + +void +afr_matrix_cleanup (int32_t **matrix, unsigned int m) +{ + int i = 0; + + if (!matrix) + goto out; + for (i = 0; i < m; i++) { + GF_FREE (matrix[i]); + } + + GF_FREE (matrix); +out: + return; +} + +int32_t** +afr_matrix_create (unsigned int m, unsigned int n) +{ + int32_t **matrix = NULL; + int i = 0; + + matrix = GF_CALLOC (sizeof (*matrix), m, gf_afr_mt_int32_t); + if (!matrix) + goto out; + + for (i = 0; i < m; i++) { + matrix[i] = GF_CALLOC (sizeof (*matrix[i]), n, + gf_afr_mt_int32_t); + if (!matrix[i]) + goto out; + } + return matrix; +out: + afr_matrix_cleanup (matrix, m); + return NULL; +} + +int +afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count) +{ + int ret = -ENOMEM; + + lk->domain = dom; + lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), + child_count, gf_afr_mt_char); + if (NULL == lk->locked_nodes) + goto out; + ret = 0; +out: + return ret; +} + +int +afr_transaction_local_init (afr_local_t *local, xlator_t *this) +{ + int child_up_count = 0; + int ret = -ENOMEM; + afr_private_t *priv = NULL; + + priv = this->private; + ret = afr_internal_lock_init (&local->internal_lock, priv->child_count, + AFR_TRANSACTION_LK); + if (ret < 0) + goto out; + + if ((local->transaction.type == AFR_DATA_TRANSACTION) || + (local->transaction.type == AFR_METADATA_TRANSACTION)) { + ret = afr_inodelk_init (&local->internal_lock.inodelk[0], + this->name, priv->child_count); + if (ret < 0) + goto out; + } + + ret = -ENOMEM; + child_up_count = AFR_COUNT (local->child_up, priv->child_count); + if (priv->optimistic_change_log && child_up_count == priv->child_count) + local->optimistic_change_log = 1; + + local->pre_op_compat = priv->pre_op_compat; + + local->transaction.eager_lock = + GF_CALLOC (sizeof (*local->transaction.eager_lock), + priv->child_count, + gf_afr_mt_int32_t); + + if (!local->transaction.eager_lock) + goto out; + + local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op), + priv->child_count, + gf_afr_mt_char); + if (!local->transaction.pre_op) + goto out; + + local->transaction.fop_subvols = GF_CALLOC (sizeof (*local->transaction.fop_subvols), + priv->child_count, + gf_afr_mt_char); + if (!local->transaction.fop_subvols) + goto out; + + local->transaction.failed_subvols = GF_CALLOC (sizeof (*local->transaction.failed_subvols), + priv->child_count, + gf_afr_mt_char); + if (!local->transaction.failed_subvols) + goto out; + + local->pending = afr_matrix_create (priv->child_count, + AFR_NUM_CHANGE_LOGS); + if (!local->pending) + goto out; + + INIT_LIST_HEAD (&local->transaction.eager_locked); + + ret = 0; +out: + return ret; +} + + +void +afr_set_low_priority (call_frame_t *frame) +{ + frame->root->pid = LOW_PRIO_PROC_PID; +} + + +gf_boolean_t +afr_have_quorum (char *logname, afr_private_t *priv) +{ + unsigned int quorum = 0; + + GF_VALIDATE_OR_GOTO(logname,priv,out); + + quorum = priv->quorum_count; + if (quorum != AFR_QUORUM_AUTO) { + return (priv->up_count >= (priv->down_count + quorum)); + } + + quorum = priv->child_count / 2 + 1; + if (priv->up_count >= (priv->down_count + quorum)) { + return _gf_true; + } + + /* + * Special case for even numbers of nodes: if we have exactly half + * and that includes the first ("senior-most") node, then that counts + * as quorum even if it wouldn't otherwise. This supports e.g. N=2 + * while preserving the critical property that there can only be one + * such group. + */ + if ((priv->child_count % 2) == 0) { + quorum = priv->child_count / 2; + if (priv->up_count >= (priv->down_count + quorum)) { + if (priv->child_up[0]) { + return _gf_true; + } + } + } + +out: + return _gf_false; +} + +void +afr_priv_destroy (afr_private_t *priv) +{ + int i = 0; + + if (!priv) + goto out; + inode_unref (priv->root_inode); + GF_FREE (priv->last_event); + if (priv->pending_key) { + for (i = 0; i < priv->child_count; i++) + GF_FREE (priv->pending_key[i]); + } + GF_FREE (priv->pending_key); + GF_FREE (priv->children); + GF_FREE (priv->child_up); + LOCK_DESTROY (&priv->lock); + + GF_FREE (priv); +out: + return; +} + +int +xlator_subvolume_count (xlator_t *this) +{ + int i = 0; + xlator_list_t *list = NULL; + + for (list = this->children; list; list = list->next) + i++; + return i; +} + + +void +afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + + local = frame->local; + + if (!local->fd) + return; + + fd_ctx = afr_fd_ctx_get (local->fd, this); + if (!fd_ctx) + return; + + fd_ctx->open_fd_count = local->open_fd_count; +} diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index f1638c740..fa1da3958 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -24,6 +15,7 @@ #include <sys/time.h> #include <stdlib.h> #include <signal.h> +#include <string.h> #ifndef _CONFIG_H #define _CONFIG_H @@ -42,293 +34,395 @@ #include "common-utils.h" #include "compat-errno.h" #include "compat.h" +#include "checksum.h" #include "afr.h" +#include "afr-transaction.h" int32_t afr_opendir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - fd_t *fd) + xlator_t *this, int32_t op_ret, int32_t op_errno, + fd_t *fd, dict_t *xdata) { - afr_local_t * local = NULL; + afr_local_t *local = NULL; + int call_count = -1; + int32_t child_index = 0; + afr_fd_ctx_t *fd_ctx = NULL; - int call_count = -1; + local = frame->local; + fd_ctx = local->fd_ctx; + child_index = (long) cookie; - LOCK (&frame->lock); - { - local = frame->local; + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } else { + local->op_ret = op_ret; + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + } + } + UNLOCK (&frame->lock); - if (op_ret == 0) - local->op_ret = 0; + call_count = afr_frame_return (frame); - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + if (call_count == 0) + AFR_STACK_UNWIND (opendir, frame, local->op_ret, + local->op_errno, local->fd, NULL); + return 0; +} - call_count = afr_frame_return (frame); - if (call_count == 0) { - AFR_STACK_UNWIND (frame, local->op_ret, - local->op_errno, local->fd); - } +int +afr_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + int i = 0; + int call_count = -1; + int32_t op_errno = ENOMEM; + afr_fd_ctx_t *fd_ctx = NULL; + + priv = this->private; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + goto out; + + loc_copy (&local->loc, loc); + + local->fd = fd_ref (fd); + local->fd_ctx = fd_ctx; + + call_count = local->call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_opendir_cbk, + (void*) (long) i, + priv->children[i], + priv->children[i]->fops->opendir, + loc, fd, NULL); + + if (!--call_count) + break; + } + } return 0; +out: + AFR_STACK_UNWIND (opendir, frame, -1, op_errno, fd, NULL); + return 0; } -int32_t -afr_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - - int child_count = 0; - int i = 0; +#define BACKEND_D_OFF_BITS 63 +#define PRESENT_D_OFF_BITS 63 - int ret = -1; - int call_count = -1; +#define ONE 1ULL +#define MASK (~0ULL) +#define PRESENT_MASK (MASK >> (64 - PRESENT_D_OFF_BITS)) +#define BACKEND_MASK (MASK >> (64 - BACKEND_D_OFF_BITS)) - int32_t op_ret = -1; - int32_t op_errno = 0; +#define TOP_BIT (ONE << (PRESENT_D_OFF_BITS - 1)) +#define SHIFT_BITS (max (0, (BACKEND_D_OFF_BITS - PRESENT_D_OFF_BITS + 1))) - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); +static uint64_t +afr_bits_for (uint64_t num) +{ + uint64_t bits = 0, ctrl = 1; - priv = this->private; + while (ctrl < num) { + ctrl *= 2; + bits ++; + } - child_count = priv->child_count; + return bits; +} - ALLOC_OR_GOTO (local, afr_local_t, out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; +int +afr_itransform (xlator_t *this, int subvol, uint64_t x, uint64_t *y_p) +{ + afr_private_t *conf = NULL; + int cnt = 0; + int max = 0; + uint64_t y = 0; + uint64_t hi_mask = 0; + uint64_t off_mask = 0; + int max_bits = 0; + + if (x == ((uint64_t) -1)) { + y = (uint64_t) -1; + goto out; + } + + conf = this->private; + if (!conf) + goto out; + + max = conf->child_count; + cnt = subvol; + + if (max == 1) { + y = x; goto out; } - frame->local = local; - local->fd = fd_ref (fd); + max_bits = afr_bits_for (max); - call_count = local->call_count; - - for (i = 0; i < child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_opendir_cbk, - priv->children[i], - priv->children[i]->fops->opendir, - loc, fd); + hi_mask = ~(PRESENT_MASK >> (max_bits + 1)); - if (!--call_count) - break; - } - } + if (x & hi_mask) { + /* HUGE d_off */ + off_mask = MASK << max_bits; + y = TOP_BIT | ((x >> SHIFT_BITS) & off_mask) | cnt; + } else { + /* small d_off */ + y = ((x * max) + cnt); + } - op_ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno, fd); - } + if (y_p) + *y_p = y; - return 0; + return 0; } -/** - * Common algorithm for directory read calls: - * - * - Try the fop on the first child that is up - * - if we have failed due to ENOTCONN: - * try the next child - * - * Applicable to: readdir - */ - -int32_t -afr_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries) +int +afr_deitransform (xlator_t *this, uint64_t y, int *subvol_p, + uint64_t *x_p) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; + afr_private_t *conf = NULL; + int cnt = 0; + int max = 0; + uint64_t x = 0; + int subvol = 0; + int max_bits = 0; + uint64_t off_mask = 0; + uint64_t host_mask = 0; + + if (!this->private) + return -1; + + conf = this->private; + max = conf->child_count; + + if (max == 1) { + x = y; + cnt = 0; + goto out; + } - gf_dirent_t * entry = NULL; + if (y & TOP_BIT) { + /* HUGE d_off */ + max_bits = afr_bits_for (max); + off_mask = (MASK << max_bits); + host_mask = ~(off_mask); - int child_index = -1; + x = ((y & ~TOP_BIT) & off_mask) << SHIFT_BITS; - priv = this->private; - children = priv->children; + cnt = y & host_mask; + } else { + /* small d_off */ + cnt = y % max; + x = y / max; + } - local = frame->local; +out: + subvol = cnt; - child_index = (long) cookie; + if (subvol_p) + *subvol_p = subvol; + + if (x_p) + *x_p = x; - if (op_ret != -1) { - list_for_each_entry (entry, &entries->list, list) { - entry->d_ino = afr_itransform (entry->d_ino, - priv->child_count, - child_index); + return 0; +} + + +static void +afr_readdir_transform_entries (gf_dirent_t *subvol_entries, int subvol, + gf_dirent_t *entries, fd_t *fd) +{ + afr_private_t *priv = NULL; + gf_dirent_t *entry = NULL; + gf_dirent_t *tmp = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + int gen = 0; + + priv = THIS->private; + + data_readable = alloca0 (priv->child_count); + metadata_readable = alloca0 (priv->child_count); + + list_for_each_entry_safe (entry, tmp, &subvol_entries->list, list) { + if (__is_root_gfid (fd->inode->gfid) && + !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { + continue; } - } - AFR_STACK_UNWIND (frame, op_ret, op_errno, entries); + list_del_init (&entry->list); + afr_itransform (THIS, subvol, entry->d_off, &entry->d_off); + list_add_tail (&entry->list, &entries->list); - return 0; + if (entry->inode) { + gen = 0; + afr_inode_read_subvol_get (entry->inode, THIS, + data_readable, + metadata_readable, &gen); + + if (gen != priv->event_generation || + !data_readable[subvol] || + !metadata_readable[subvol]) { + + inode_unref (entry->inode); + entry->inode = NULL; + } + } + } } int32_t -afr_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset) +afr_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *subvol_entries, + dict_t *xdata) { - afr_private_t * priv = NULL; - xlator_t ** children = NULL; - int call_child = 0; - afr_local_t *local = NULL; + afr_local_t *local = NULL; + gf_dirent_t entries; - int ret = -1; + INIT_LIST_HEAD (&entries.list); - int32_t op_ret = -1; - int32_t op_errno = 0; + local = frame->local; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - children = priv->children; + if (op_ret < 0 && !local->cont.readdir.offset) { + /* failover only if this was first readdir, detected + by offset == 0 */ + local->op_ret = op_ret; + local->op_errno = op_errno; - ALLOC_OR_GOTO (local, afr_local_t, out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - frame->local = local; - - call_child = afr_first_up_child (priv); - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_DEBUG, - "no child is up"); - goto out; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; } - local->fd = fd_ref (fd); - local->cont.readdir.size = size; - local->cont.readdir.offset = offset; + if (op_ret >= 0) + afr_readdir_transform_entries (subvol_entries, (long) cookie, + &entries, local->fd); - STACK_WIND_COOKIE (frame, afr_readdir_cbk, (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readdir, - fd, size, offset); + AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, &entries, xdata); - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); - } - return 0; + return 0; } -int32_t -afr_getdents_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dir_entry_t *entry, int32_t count) +int +afr_readdir_wind (call_frame_t *frame, xlator_t *this, int subvol) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - int unwind = 1; - int last_tried = -1; - int this_try = -1; + priv = this->private; + local = frame->local; - priv = this->private; - children = priv->children; + if (subvol == -1) { + AFR_STACK_UNWIND (readdir, frame, local->op_ret, + local->op_errno, 0, 0); + return 0; + } - local = frame->local; + if (local->op == GF_FOP_READDIR) + STACK_WIND_COOKIE (frame, afr_readdir_cbk, + (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->readdir, + local->fd, local->cont.readdir.size, + local->cont.readdir.offset, + local->xdata_req); + else + STACK_WIND_COOKIE (frame, afr_readdir_cbk, + (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->readdirp, + local->fd, local->cont.readdir.size, + local->cont.readdir.offset, + local->xdata_req); + return 0; +} - if (op_ret == -1) { - last_tried = local->cont.getdents.last_tried; - if (all_tried (last_tried, priv->child_count)) { - goto out; - } +int +afr_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, int whichop, dict_t *dict) +{ + afr_local_t *local = NULL; + int32_t op_errno = 0; + int subvol = -1; - this_try = ++local->cont.getdents.last_tried; - unwind = 0; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - STACK_WIND (frame, afr_getdents_cbk, - children[this_try], - children[this_try]->fops->getdents, - local->fd, local->cont.getdents.size, - local->cont.getdents.offset, local->cont.getdents.flag); + local->op = whichop; + local->fd = fd_ref (fd); + local->cont.readdir.size = size; + local->cont.readdir.offset = offset; + local->xdata_req = (dict)? dict_ref (dict) : NULL; + + if (offset == 0) { + /* First readdir has option of failing over and selecting + an appropriate read subvolume */ + afr_read_txn (frame, this, fd->inode, afr_readdir_wind, + AFR_DATA_TRANSACTION); + } else { + /* But continued readdirs MUST stick to the same subvolume + without an option to failover */ + afr_deitransform (this, offset, &subvol, + (uint64_t *)&local->cont.readdir.offset); + afr_readdir_wind (frame, this, subvol); } + return 0; out: - if (unwind) { - AFR_STACK_UNWIND (frame, op_ret, op_errno, entry, count); - } - - return 0; + AFR_STACK_UNWIND (readdir, frame, -1, op_errno, NULL, NULL); + return 0; } int32_t -afr_getdents (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, int32_t flag) +afr_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) { - afr_private_t * priv = NULL; - xlator_t ** children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR, xdata); - priv = this->private; - children = priv->children; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - call_child = afr_first_up_child (priv); - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_DEBUG, - "no child is up."); - goto out; - } + return 0; +} - local->cont.getdents.last_tried = call_child; - local->fd = fd_ref (fd); +int32_t +afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *dict) +{ + afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIRP, dict); - local->cont.getdents.size = size; - local->cont.getdents.offset = offset; - local->cont.getdents.flag = flag; - - frame->local = local; + return 0; +} - STACK_WIND (frame, afr_getdents_cbk, - children[call_child], children[call_child]->fops->getdents, - fd, size, offset, flag); - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); - } +int32_t +afr_releasedir (xlator_t *this, fd_t *fd) +{ + afr_cleanup_fd_ctx (this, fd); - return 0; + return 0; } - - diff --git a/xlators/cluster/afr/src/afr-dir-read.h b/xlators/cluster/afr/src/afr-dir-read.h index 6d981fdfd..09456d159 100644 --- a/xlators/cluster/afr/src/afr-dir-read.h +++ b/xlators/cluster/afr/src/afr-dir-read.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __DIR_READ_H__ @@ -23,25 +14,23 @@ int32_t afr_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd); + loc_t *loc, fd_t *fd, dict_t *xdata); int32_t -afr_closedir (call_frame_t *frame, xlator_t *this, - fd_t *fd); +afr_releasedir (xlator_t *this, fd_t *fd); int32_t afr_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset); + fd_t *fd, size_t size, off_t offset, dict_t *xdata); int32_t -afr_getdents (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, int32_t flag); - +afr_readdirp (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset, dict_t *dict); int32_t afr_checksum (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags); + loc_t *loc, int32_t flags, dict_t *xdata); #endif /* __DIR_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index b07ac3c48..465dde54f 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -43,1880 +34,1446 @@ #include "common-utils.h" #include "compat-errno.h" #include "compat.h" +#include "byte-order.h" #include "afr.h" #include "afr-transaction.h" - void -afr_build_parent_loc (loc_t *parent, loc_t *child) -{ - char *tmp = NULL; - - if (!child->parent) { - loc_copy (parent, child); - return; - } - - tmp = strdup (child->path); - parent->path = strdup (dirname (tmp)); - FREE (tmp); - - parent->name = strrchr (parent->path, '/'); - if (parent->name) - parent->name++; - - parent->inode = inode_ref (child->parent); - parent->parent = inode_parent (parent->inode, 0, NULL); - parent->ino = parent->inode->ino; -} - -/* {{{ create */ +afr_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this); int -afr_create_unwind (call_frame_t *frame, xlator_t *this) +afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; + int ret = -1; + char *child_path = NULL; - local = frame->local; + if (!child->parent) { + if (op_errno) + *op_errno = EINVAL; + goto out; + } - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + child_path = gf_strdup (child->path); + if (!child_path) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } - if (main_frame) { - AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, - local->cont.create.fd, - local->cont.create.inode, - &local->cont.create.buf); + parent->path = gf_strdup (dirname (child_path)); + if (!parent->path) { + if (op_errno) + *op_errno = ENOMEM; + goto out; } - - return 0; -} + parent->inode = inode_ref (child->parent); + uuid_copy (parent->gfid, child->pargfid); -int -afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - fd_t *fd, inode_t *inode, struct stat *buf) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; + ret = 0; +out: + GF_FREE (child_path); - int ret = 0; + return ret; +} - int call_count = -1; - int child_index = -1; - local = frame->local; - priv = this->private; +static void +__afr_dir_write_finalize (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int inode_read_subvol = -1; + int parent_read_subvol = -1; + int parent2_read_subvol = -1; + int i = 0; - child_index = (long) cookie; + local = frame->local; + priv = this->private; - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - ret = afr_fd_ctx_set (this, fd); - - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "could not set ctx on fd=%p", fd); - - local->op_ret = -1; - local->op_errno = -ret; - } - - - if (op_ret != -1) { - local->op_ret = op_ret; - - if (local->success_count == 0) { - local->cont.create.buf = *buf; - local->cont.create.buf.st_ino = - afr_itransform (buf->st_ino, - priv->child_count, - child_index); - - if (priv->read_child >= 0) { - afr_set_read_child (this, inode, - priv->read_child); - } else { - afr_set_read_child (this, inode, - local->read_child_index); - } - } - - if (child_index == local->read_child_index) { - if (priv->read_child >= 0) { - afr_set_read_child (this, inode, - priv->read_child); - } else { - afr_set_read_child (this, inode, - local->read_child_index); - } - } - - local->cont.create.inode = inode; - - local->success_count++; + if (local->inode) { + afr_replies_interpret (frame, this, local->inode); + inode_read_subvol = afr_data_subvol_get (local->inode, this, + NULL, NULL); + } + if (local->parent) + parent_read_subvol = afr_data_subvol_get (local->parent, this, + NULL, NULL); + if (local->parent2) + parent2_read_subvol = afr_data_subvol_get (local->parent2, this, + NULL, NULL); + + local->op_ret = -1; + local->op_errno = afr_final_errno (local, priv); + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret < 0) { + if (local->inode) + afr_inode_read_subvol_reset (local->inode, + this); + if (local->parent) + afr_inode_read_subvol_reset (local->parent, + this); + if (local->parent2) + afr_inode_read_subvol_reset (local->parent2, + this); + continue; } - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + if (local->op_ret == -1) { + local->op_ret = local->replies[i].op_ret; + local->op_errno = local->replies[i].op_errno; + + local->cont.dir_fop.buf = + local->replies[i].poststat; + local->cont.dir_fop.preparent = + local->replies[i].preparent; + local->cont.dir_fop.postparent = + local->replies[i].postparent; + local->cont.dir_fop.prenewparent = + local->replies[i].preparent2; + local->cont.dir_fop.postnewparent = + local->replies[i].postparent2; + if (local->replies[i].xdata) + local->xdata_rsp = + dict_ref (local->replies[i].xdata); + continue; + } - call_count = afr_frame_return (frame); + if (i == inode_read_subvol) { + local->cont.dir_fop.buf = + local->replies[i].poststat; + if (local->replies[i].xdata) { + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); + local->xdata_rsp = + dict_ref (local->replies[i].xdata); + } + } - if (call_count == 0) { - local->transaction.unwind (frame, this); + if (i == parent_read_subvol) { + local->cont.dir_fop.preparent = + local->replies[i].preparent; + local->cont.dir_fop.postparent = + local->replies[i].postparent; + } - local->transaction.resume (frame, this); + if (i == parent2_read_subvol) { + local->cont.dir_fop.prenewparent = + local->replies[i].preparent2; + local->cont.dir_fop.postnewparent = + local->replies[i].postparent2; + } } - - return 0; } -int -afr_create_wind (call_frame_t *frame, xlator_t *this) +static void +__afr_dir_write_fill (call_frame_t *frame, xlator_t *this, int child_index, + int op_ret, int op_errno, struct iatt *poststat, + struct iatt *preparent, struct iatt *postparent, + struct iatt *preparent2, struct iatt *postparent2, + dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + + local = frame->local; + fd_ctx = local->fd_ctx; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + + if (op_ret >= 0) { + if (poststat) + local->replies[child_index].poststat = *poststat; + if (preparent) + local->replies[child_index].preparent = *preparent; + if (postparent) + local->replies[child_index].postparent = *postparent; + if (preparent2) + local->replies[child_index].preparent2 = *preparent2; + if (postparent2) + local->replies[child_index].postparent2 = *postparent2; + if (xdata) + local->replies[child_index].xdata = dict_ref (xdata); + + if (fd_ctx) + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + } else { + if (op_errno != ENOTEMPTY) + afr_transaction_fop_failed (frame, this, child_index); + if (fd_ctx) + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } - int call_count = -1; - int i = 0; + return; +} - local = frame->local; - priv = this->private; - call_count = afr_up_children_count (priv->child_count, local->child_up); +static int +__afr_dir_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, + struct iatt *preparent2, struct iatt *postparent2, + dict_t *xdata) +{ + afr_local_t *local = NULL; + int child_index = (long) cookie; + int call_count = -1; - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } + local = frame->local; - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_create_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->create, - &local->loc, - local->cont.create.flags, - local->cont.create.mode, - local->cont.create.fd); - if (!--call_count) - break; - } + LOCK (&frame->lock); + { + __afr_dir_write_fill (frame, this, child_index, op_ret, + op_errno, buf, preparent, postparent, + preparent2, postparent2, xdata); } - - return 0; -} + UNLOCK (&frame->lock); + call_count = afr_frame_return (frame); + if (call_count == 0) { + __afr_dir_write_finalize (frame, this); -int -afr_create_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - - local = frame->local; + if (afr_txn_nothing_failed (frame, this)) + local->transaction.unwind (frame, this); - local->transaction.unwind (frame, this); + afr_mark_entry_pending_changelog (frame, this); - AFR_STACK_DESTROY (frame); + local->transaction.resume (frame, this); + } - return 0; + return 0; } int -afr_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, fd_t *fd) +afr_mark_new_entry_changelog_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + dict_t *xattr, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; + int call_count = 0; - int ret = -1; + call_count = afr_frame_return (frame); - int op_ret = -1; - int op_errno = 0; + if (call_count == 0) + AFR_STACK_DESTROY (frame); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + return 0; +} - priv = this->private; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); +void +afr_mark_new_entry_changelog (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *new_frame = NULL; + afr_local_t *local = NULL; + afr_local_t *new_local = NULL; + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + int32_t **changelog = NULL; + int i = 0; + int idx = 0; + int op_errno = ENOMEM; + unsigned char *pending = NULL; + int call_count = 0; + + local = frame->local; + priv = this->private; + + new_frame = copy_frame (frame); + if (!new_frame) + goto out; + + new_local = AFR_FRAME_INIT (new_frame, op_errno); + if (!new_local) goto out; - } - ALLOC_OR_GOTO (local, afr_local_t, out); + changelog = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS); + if (!changelog) + goto out; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + xattr = dict_new (); + if (!xattr) goto out; - } - transaction_frame->local = local; + idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); - loc_copy (&local->loc, loc); + pending = alloca0 (priv->child_count); - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); - - local->cont.create.flags = flags; - local->cont.create.mode = mode; - local->cont.create.fd = fd_ref (fd); - - local->transaction.fop = afr_create_wind; - local->transaction.done = afr_create_done; - local->transaction.unwind = afr_create_unwind; - - afr_build_parent_loc (&local->transaction.parent_loc, loc); - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] && + !local->transaction.failed_subvols[i]) { + call_count ++; + continue; + } - op_ret = 0; -out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + changelog[i][idx] = hton32(1); + pending[i] = 1; } - return 0; -} + new_local->pending = changelog; + uuid_copy (new_local->loc.gfid, local->cont.dir_fop.buf.ia_gfid); + new_local->loc.inode = inode_ref (local->inode); -/* }}} */ -/* {{{ mknod */ + afr_set_pending_dict (priv, xattr, changelog); -int -afr_mknod_unwind (call_frame_t *frame, xlator_t *this) -{ - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; + new_local->call_count = call_count; - local = frame->local; + for (i = 0; i < priv->child_count; i++) { + if (pending[i]) + continue; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, - local->cont.mknod.inode, - &local->cont.mknod.buf); + STACK_WIND_COOKIE (new_frame, afr_mark_new_entry_changelog_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->xattrop, + &new_local->loc, GF_XATTROP_ADD_ARRAY, + xattr, NULL); + if (!--call_count) + break; } - return 0; + new_frame = NULL; +out: + if (new_frame) + AFR_STACK_DESTROY (new_frame); + if (xattr) + dict_unref (xattr); + return; } -int -afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct stat *buf) +void +afr_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int child_index = -1; - - local = frame->local; - priv = this->private; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int pre_op_count = 0; + int failed_count = 0; - child_index = (long) cookie; + local = frame->local; + priv = this->private; - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; - - if (local->success_count == 0){ - local->cont.mknod.buf = *buf; - local->cont.mknod.buf.st_ino = - afr_itransform (buf->st_ino, - priv->child_count, - child_index); - - if (priv->read_child >= 0) { - afr_set_read_child (this, inode, - priv->read_child); - } else { - afr_set_read_child (this, inode, - local->read_child_index); - } - } + if (local->op_ret < 0) + return; - if (child_index == local->read_child_index) { - if (priv->read_child >= 0) { - afr_set_read_child (this, inode, - priv->read_child); - } else { - afr_set_read_child (this, inode, - local->read_child_index); - } - } - - local->cont.mknod.inode = inode; - - local->success_count++; - } + if (local->op != GF_FOP_CREATE && local->op != GF_FOP_MKNOD) + return; - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + pre_op_count = AFR_COUNT (local->transaction.pre_op, priv->child_count); + failed_count = AFR_COUNT (local->transaction.failed_subvols, + priv->child_count); - call_count = afr_frame_return (frame); + if (pre_op_count == priv->child_count && !failed_count) + return; - if (call_count == 0) { - local->transaction.unwind (frame, this); + afr_mark_new_entry_changelog (frame, this); - local->transaction.resume (frame, this); - } - - return 0; + return; } -int32_t -afr_mknod_wind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; +/* {{{ create */ - int call_count = -1; - int i = 0; +int +afr_create_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - local = frame->local; - priv = this->private; + local = frame->local; - call_count = afr_up_children_count (priv->child_count, local->child_up); + main_frame = afr_transaction_detach_fop_frame (frame); - if (call_count == 0) { - local->transaction.resume (frame, this); + if (!main_frame) return 0; - } - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mknod, - &local->loc, local->cont.mknod.mode, - local->cont.mknod.dev); - if (!--call_count) - break; - } - } - - return 0; + AFR_STACK_UNWIND (create, main_frame, local->op_ret, local->op_errno, + local->cont.create.fd, local->inode, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; } int -afr_mknod_done (call_frame_t *frame, xlator_t *this) +afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + fd_t *fd, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { - afr_local_t * local = NULL; - - local = frame->local; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); +} - local->transaction.unwind (frame, this); - AFR_STACK_DESTROY (frame); - return 0; +int +afr_create_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE (frame, afr_create_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->create, + &local->loc, local->cont.create.flags, + local->cont.create.mode, local->umask, + local->cont.create.fd, local->xdata_req); + return 0; } int -afr_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t dev) +afr_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - int ret = -1; + priv = this->private; - int op_ret = -1; - int op_errno = 0; + QUORUM_CHECK(create,out); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; - priv = this->private; - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) goto out; - } - ALLOC_OR_GOTO (local, afr_local_t, out); + loc_copy (&local->loc, loc); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + local->fd_ctx = afr_fd_ctx_get (fd, this); + if (!local->fd_ctx) goto out; - } - - transaction_frame->local = local; - loc_copy (&local->loc, loc); - - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); - local->cont.mknod.mode = mode; - local->cont.mknod.dev = dev; + local->op = GF_FOP_CREATE; + local->cont.create.flags = flags; + local->cont.create.mode = mode; + local->cont.create.fd = fd_ref (fd); + local->umask = umask; - local->transaction.fop = afr_mknod_wind; - local->transaction.done = afr_mknod_done; - local->transaction.unwind = afr_mknod_unwind; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - afr_build_parent_loc (&local->transaction.parent_loc, loc); - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); + if (!local->xdata_req) + goto out; - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + local->transaction.wind = afr_create_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_create_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); - return 0; + AFR_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); + return 0; } /* }}} */ -/* {{{ mkdir */ - +/* {{{ mknod */ int -afr_mkdir_unwind (call_frame_t *frame, xlator_t *this) +afr_mknod_unwind (call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + local = frame->local; - if (main_frame) { - AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, - local->cont.mkdir.inode, - &local->cont.mkdir.buf); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - return 0; + AFR_STACK_UNWIND (mknod, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; } int -afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct stat *buf) +afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int child_index = -1; - - local = frame->local; - priv = this->private; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; - - if (local->success_count == 0) { - local->cont.mkdir.buf = *buf; - local->cont.mkdir.buf.st_ino = - afr_itransform (buf->st_ino, priv->child_count, - child_index); - - if (priv->read_child >= 0) { - afr_set_read_child (this, inode, - priv->read_child); - } else { - afr_set_read_child (this, inode, - local->read_child_index); - } - } - - if (child_index == local->read_child_index) { - if (priv->read_child >= 0) { - afr_set_read_child (this, inode, - priv->read_child); - } else { - afr_set_read_child (this, inode, - local->read_child_index); - } - } - - local->cont.mkdir.inode = inode; - - local->success_count++; - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } int -afr_mkdir_wind (call_frame_t *frame, xlator_t *this) +afr_mknod_wind (call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mkdir, - &local->loc, local->cont.mkdir.mode); - if (!--call_count) - break; - } - } - - return 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->mknod, + &local->loc, local->cont.mknod.mode, + local->cont.mknod.dev, local->umask, + local->xdata_req); + return 0; } - int -afr_mkdir_done (call_frame_t *frame, xlator_t *this) +afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t dev, mode_t umask, dict_t *xdata) { - afr_local_t * local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - AFR_STACK_DESTROY (frame); - - return 0; -} + priv = this->private; + QUORUM_CHECK(mknod,out); -int -afr_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; - int ret = -1; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; - int op_ret = -1; - int op_errno = 0; + loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + local->op = GF_FOP_MKNOD; + local->cont.mknod.mode = mode; + local->cont.mknod.dev = dev; + local->umask = umask; - priv = this->private; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); + if (!local->xdata_req) goto out; - } - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { + local->transaction.wind = afr_mknod_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_mknod_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { op_errno = -ret; goto out; - } - - transaction_frame->local = local; - - loc_copy (&local->loc, loc); - - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); } - UNLOCK (&priv->read_child_lock); - - local->cont.mkdir.mode = mode; - - local->transaction.fop = afr_mkdir_wind; - local->transaction.done = afr_mkdir_done; - local->transaction.unwind = afr_mkdir_unwind; - afr_build_parent_loc (&local->transaction.parent_loc, loc); - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); - - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); - } - - return 0; + AFR_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); + return 0; } /* }}} */ -/* {{{ link */ +/* {{{ mkdir */ int -afr_link_unwind (call_frame_t *frame, xlator_t *this) +afr_mkdir_unwind (call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - if (main_frame) { - local->cont.link.buf.st_ino = local->cont.link.ino; + local = frame->local; - AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, - local->cont.link.inode, - &local->cont.link.buf); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - return 0; + AFR_STACK_UNWIND (mkdir, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; } int -afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct stat *buf) +afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int child_index = -1; - - local = frame->local; - priv = this->private; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; - - if (local->success_count == 0) { - local->cont.link.buf = *buf; - local->cont.link.buf.st_ino = - afr_itransform (buf->st_ino, priv->child_count, - child_index); - - if (priv->read_child >= 0) { - afr_set_read_child (this, inode, - priv->read_child); - } else { - afr_set_read_child (this, inode, - local->read_child_index); - } - } - - if (child_index == local->read_child_index) { - if (priv->read_child >= 0) { - afr_set_read_child (this, inode, - priv->read_child); - } else { - afr_set_read_child (this, inode, - local->read_child_index); - } - } - - local->cont.link.inode = inode; - - local->success_count++; - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } int -afr_link_wind (call_frame_t *frame, xlator_t *this) +afr_mkdir_wind (call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->link, - &local->loc, - &local->newloc); - - if (!--call_count) - break; - } - } - - return 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->mkdir, &local->loc, + local->cont.mkdir.mode, local->umask, + local->xdata_req); + return 0; } int -afr_link_done (call_frame_t *frame, xlator_t *this) +afr_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) { - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - return 0; -} + priv = this->private; + QUORUM_CHECK(mkdir,out); -int -afr_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; - int ret = -1; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; - int op_ret = -1; - int op_errno = 0; + loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + local->cont.mkdir.mode = mode; + local->umask = umask; - priv = this->private; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); + if (!local->xdata_req) goto out; - } - - ALLOC_OR_GOTO (local, afr_local_t, out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { + local->op = GF_FOP_MKDIR; + local->transaction.wind = afr_mkdir_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_mkdir_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { op_errno = -ret; goto out; - } - - transaction_frame->local = local; - - loc_copy (&local->loc, oldloc); - loc_copy (&local->newloc, newloc); - - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); } - UNLOCK (&priv->read_child_lock); - - local->cont.link.ino = oldloc->inode->ino; - - local->transaction.fop = afr_link_wind; - local->transaction.done = afr_link_done; - local->transaction.unwind = afr_link_unwind; - - afr_build_parent_loc (&local->transaction.parent_loc, oldloc); - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (oldloc->path); - local->transaction.new_basename = AFR_BASENAME (newloc->path); - - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); - - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (frame, op_ret, op_errno); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); - return 0; + AFR_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); + return 0; } /* }}} */ -/* {{{ symlink */ +/* {{{ link */ int -afr_symlink_unwind (call_frame_t *frame, xlator_t *this) +afr_link_unwind (call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, - local->cont.symlink.inode, - &local->cont.symlink.buf); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - return 0; + AFR_STACK_UNWIND (link, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; } int -afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct stat *buf) +afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int child_index = -1; - - local = frame->local; - priv = this->private; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; - - if (local->success_count == 0) { - local->cont.symlink.buf = *buf; - local->cont.symlink.buf.st_ino = - afr_itransform (buf->st_ino, priv->child_count, - child_index); - - if (priv->read_child >= 0) { - afr_set_read_child (this, inode, - priv->read_child); - } else { - afr_set_read_child (this, inode, - local->read_child_index); - } - } - - if (child_index == local->read_child_index) { - if (priv->read_child >= 0) { - afr_set_read_child (this, inode, - priv->read_child); - } else { - afr_set_read_child (this, inode, - local->read_child_index); - } - } - - local->cont.symlink.inode = inode; - - local->success_count++; - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } int -afr_symlink_wind (call_frame_t *frame, xlator_t *this) +afr_link_wind (call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->symlink, - local->cont.symlink.linkpath, - &local->loc); - - if (!--call_count) - break; - - } - } - - return 0; + STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->link, + &local->loc, &local->newloc, local->xdata_req); + return 0; } int -afr_symlink_done (call_frame_t *frame, xlator_t *this) +afr_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - AFR_STACK_DESTROY (frame); - - return 0; -} + priv = this->private; + QUORUM_CHECK(link,out); -int -afr_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *loc) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; - int ret = -1; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; - int op_ret = -1; - int op_errno = 0; + loc_copy (&local->loc, oldloc); + loc_copy (&local->newloc, newloc); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + local->inode = inode_ref (oldloc->inode); + local->parent = inode_ref (newloc->parent); - priv = this->private; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); + if (!local->xdata_req) goto out; - } - - ALLOC_OR_GOTO (local, afr_local_t, out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { + local->op = GF_FOP_LINK; + + local->transaction.wind = afr_link_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_link_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, newloc, + &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (newloc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { op_errno = -ret; goto out; - } - - transaction_frame->local = local; - - loc_copy (&local->loc, loc); - - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); } - UNLOCK (&priv->read_child_lock); - - local->cont.symlink.ino = loc->inode->ino; - local->cont.symlink.linkpath = strdup (linkpath); - - local->transaction.fop = afr_symlink_wind; - local->transaction.done = afr_symlink_done; - local->transaction.unwind = afr_symlink_unwind; - - afr_build_parent_loc (&local->transaction.parent_loc, loc); - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); - - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); - return 0; + AFR_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); + return 0; } /* }}} */ -/* {{{ rename */ +/* {{{ symlink */ + int -afr_rename_unwind (call_frame_t *frame, xlator_t *this) +afr_symlink_unwind (call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - local->cont.rename.buf.st_ino = local->cont.rename.ino; - - AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, - &local->cont.rename.buf); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - return 0; + AFR_STACK_UNWIND (symlink, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; } int -afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) +afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int child_index = -1; - - local = frame->local; - priv = this->private; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if ((op_ret != -1) && (local->success_count == 0)) { - local->op_ret = op_ret; - - if (buf) { - local->cont.rename.buf = *buf; - local->cont.rename.buf.st_ino = - afr_itransform (buf->st_ino, priv->child_count, - child_index); - } - local->success_count++; - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } -int32_t -afr_rename_wind (call_frame_t *frame, xlator_t *this) +int +afr_symlink_wind (call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->rename, - &local->loc, - &local->newloc); - if (!--call_count) - break; - } - } - - return 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->symlink, + local->cont.symlink.linkpath, &local->loc, + local->umask, local->xdata_req); + return 0; } int -afr_rename_done (call_frame_t *frame, xlator_t *this) +afr_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, mode_t umask, dict_t *xdata) { - afr_local_t * local = frame->local; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - local->transaction.unwind (frame, this); + priv = this->private; - AFR_STACK_DESTROY (frame); - - return 0; -} + QUORUM_CHECK(symlink,out); + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; -int -afr_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - - int ret = -1; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; - int op_ret = -1; - int op_errno = 0; + loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + local->cont.symlink.linkpath = gf_strdup (linkpath); + local->umask = umask; - priv = this->private; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); + if (!local->xdata_req) goto out; - } - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { + local->op = GF_FOP_SYMLINK; + local->transaction.wind = afr_symlink_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_symlink_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { op_errno = -ret; goto out; - } - - transaction_frame->local = local; - - loc_copy (&local->loc, oldloc); - loc_copy (&local->newloc, newloc); - - local->cont.rename.ino = oldloc->inode->ino; - - local->transaction.fop = afr_rename_wind; - local->transaction.done = afr_rename_done; - local->transaction.unwind = afr_rename_unwind; - - afr_build_parent_loc (&local->transaction.parent_loc, oldloc); - afr_build_parent_loc (&local->transaction.new_parent_loc, newloc); - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (oldloc->path); - local->transaction.new_basename = AFR_BASENAME (newloc->path); - - afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION); + } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); - return 0; + AFR_STACK_UNWIND (symlink, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL); + return 0; } /* }}} */ -/* {{{ unlink */ +/* {{{ rename */ int -afr_unlink_unwind (call_frame_t *frame, xlator_t *this) +afr_rename_unwind (call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) - AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - return 0; + AFR_STACK_UNWIND (rename, main_frame, local->op_ret, local->op_errno, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + &local->cont.dir_fop.prenewparent, + &local->cont.dir_fop.postnewparent, local->xdata_rsp); + return 0; } int -afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int child_index = (long) cookie; - int need_unwind = 0; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preoldparent, postoldparent, prenewparent, + postnewparent, xdata); } -int32_t -afr_unlink_wind (call_frame_t *frame, xlator_t *this) +int +afr_rename_wind (call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } + priv = this->private; - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->unlink, - &local->loc); - - if (!--call_count) - break; - } - } - - return 0; + STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->rename, + &local->loc, &local->newloc, local->xdata_req); + return 0; } -int32_t -afr_unlink_done (call_frame_t *frame, xlator_t *this) +int +afr_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - afr_local_t * local = frame->local; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + int nlockee = 0; - local->transaction.unwind (frame, this); + priv = this->private; - AFR_STACK_DESTROY (frame); - - return 0; -} + QUORUM_CHECK(rename,out); + transaction_frame = copy_frame (frame); + if (!transaction_frame) + op_errno = ENOMEM; -int32_t -afr_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - - int ret = -1; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; - int op_ret = -1; - int op_errno = 0; + loc_copy (&local->loc, oldloc); + loc_copy (&local->newloc, newloc); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + local->inode = inode_ref (oldloc->inode); + local->parent = inode_ref (oldloc->parent); + local->parent2 = inode_ref (newloc->parent); - priv = this->private; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); + if (!local->xdata_req) goto out; - } - ALLOC_OR_GOTO (local, afr_local_t, out); + local->op = GF_FOP_RENAME; + local->transaction.wind = afr_rename_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_rename_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, oldloc, + &op_errno); + if (ret) + goto out; + ret = afr_build_parent_loc (&local->transaction.new_parent_loc, newloc, + &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (oldloc->path); + local->transaction.new_basename = AFR_BASENAME (newloc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = nlockee = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->transaction.new_parent_loc, + local->transaction.new_basename, + priv->child_count); + if (ret) + goto out; + + nlockee++; + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + + nlockee++; + if (local->newloc.inode && IA_ISDIR (local->newloc.inode->ia_type)) { + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->newloc, + NULL, + priv->child_count); + if (ret) + goto out; + + nlockee++; + } + qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee), + afr_entry_lockee_cmp); + int_lock->lockee_count = nlockee; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION); + if (ret < 0) { op_errno = -ret; goto out; - } - - transaction_frame->local = local; - - loc_copy (&local->loc, loc); - - local->transaction.fop = afr_unlink_wind; - local->transaction.done = afr_unlink_done; - local->transaction.unwind = afr_unlink_unwind; - - afr_build_parent_loc (&local->transaction.parent_loc, loc); - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (frame, op_ret, op_errno); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); - return 0; + AFR_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); + return 0; } /* }}} */ -/* {{{ rmdir */ - - +/* {{{ unlink */ int -afr_rmdir_unwind (call_frame_t *frame, xlator_t *this) +afr_unlink_unwind (call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + local = frame->local; - if (main_frame) - AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - return 0; + AFR_STACK_UNWIND (unlink, main_frame, local->op_ret, local->op_errno, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; } int -afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int child_index = (long) cookie; - int need_unwind = 0; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - - if (local->success_count == priv->wait_count) - need_unwind = 1; - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, NULL, + preparent, postparent, NULL, NULL, xdata); } int -afr_rmdir_wind (call_frame_t *frame, xlator_t *this) +afr_unlink_wind (call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->rmdir, - &local->loc); + local = frame->local; + priv = this->private; - if (!--call_count) - break; - } - } - - return 0; + STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->unlink, + &local->loc, local->xflag, local->xdata_req); + return 0; } int -afr_rmdir_done (call_frame_t *frame, xlator_t *this) +afr_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) { - afr_local_t * local = frame->local; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - local->transaction.unwind (frame, this); + priv = this->private; - AFR_STACK_DESTROY (frame); - - return 0; -} + QUORUM_CHECK(unlink,out); + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; -int -afr_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - - int ret = -1; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; - int op_ret = -1; - int op_errno = 0; + loc_copy (&local->loc, loc); + local->xflag = xflag; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); - priv = this->private; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); + if (!local->xdata_req) goto out; - } - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { + local->op = GF_FOP_UNLINK; + local->transaction.wind = afr_unlink_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_unlink_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { op_errno = -ret; goto out; - } - - transaction_frame->local = local; - - loc_copy (&local->loc, loc); - - local->transaction.fop = afr_rmdir_wind; - local->transaction.done = afr_rmdir_done; - local->transaction.unwind = afr_rmdir_unwind; - - afr_build_parent_loc (&local->transaction.parent_loc, loc); - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (frame, op_ret, op_errno); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); - return 0; + AFR_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } /* }}} */ -/* {{{ setdents */ - -int32_t -afr_setdents_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; +/* {{{ rmdir */ - int call_count = -1; - int child_index = (long) cookie; - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - if ((op_ret != -1) && (local->success_count == 0)) { - local->op_ret = op_ret; - local->success_count++; - } +int +afr_rmdir_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + local = frame->local; - call_count = afr_frame_return (frame); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + AFR_STACK_UNWIND (rmdir, main_frame, local->op_ret, local->op_errno, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; } -int32_t -afr_setdents_wind (call_frame_t *frame, xlator_t *this) +int +afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_setdents_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setdents, - local->fd, local->cont.setdents.flags, - local->cont.setdents.entries, - local->cont.setdents.count); - - if (!--call_count) - break; - } - } - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, NULL, + preparent, postparent, NULL, NULL, xdata); } -int32_t -afr_setdents_done (call_frame_t *frame, xlator_t *this) +int +afr_rmdir_wind (call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t * local = frame->local; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); - - return 0; + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->rmdir, + &local->loc, local->cont.rmdir.flags, local->xdata_req); + return 0; } -int32_t -afr_setdents (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count) +int +afr_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - - int ret = -1; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + int nlockee = 0; - int op_ret = -1; - int op_errno = 0; + priv = this->private; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; + QUORUM_CHECK(rmdir,out); - ALLOC_OR_GOTO (local, afr_local_t, out); + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) goto out; - } - frame->local = local; - local->fd = fd_ref (fd); + loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); - local->cont.setdents.flags = flags; - local->cont.setdents.entries = entries; - local->cont.setdents.count = count; + local->cont.rmdir.flags = flags; - local->transaction.fop = afr_setdents_wind; - local->transaction.done = afr_setdents_done; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - local->transaction.basename = NULL; + if (!local->xdata_req) + goto out; - afr_transaction (frame, this, AFR_ENTRY_TRANSACTION); + local->op = GF_FOP_RMDIR; + local->transaction.wind = afr_rmdir_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_rmdir_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = nlockee = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + + nlockee++; + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->loc, + NULL, + priv->child_count); + if (ret) + goto out; + + nlockee++; + qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee), + afr_entry_lockee_cmp); + int_lock->lockee_count = nlockee; + + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); - return 0; + AFR_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } /* }}} */ diff --git a/xlators/cluster/afr/src/afr-dir-write.h b/xlators/cluster/afr/src/afr-dir-write.h index 76b4b60fa..02f0a3682 100644 --- a/xlators/cluster/afr/src/afr-dir-write.h +++ b/xlators/cluster/afr/src/afr-dir-write.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __DIR_WRITE_H__ @@ -22,38 +13,35 @@ int32_t afr_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, fd_t *fd); + loc_t *loc, int32_t flags, mode_t mode, + mode_t umask, fd_t *fd, dict_t *xdata); int32_t afr_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t dev); + loc_t *loc, mode_t mode, dev_t dev, mode_t umask, dict_t *xdata); int32_t afr_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode); + loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata); int32_t afr_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc); + loc_t *loc, int xflag, dict_t *xdata); int32_t afr_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc); + loc_t *loc, int flags, dict_t *xdata); int32_t afr_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc); + loc_t *oldloc, loc_t *newloc, dict_t *xdata); int32_t afr_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc); + loc_t *oldloc, loc_t *newloc, dict_t *xdata); -int32_t +int afr_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *oldloc); - -int32_t -afr_setdents (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count); + const char *linkpath, loc_t *oldloc, mode_t umask, dict_t *params); #endif /* __DIR_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index 4819d06cd..4cb219246 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -44,832 +35,1630 @@ #include "compat-errno.h" #include "compat.h" -#include "afr.h" +#include "afr-transaction.h" -/** - * Common algorithm for inode read calls: - * - * - Try the fop on the first child that is up - * - if we have failed due to ENOTCONN: - * try the next child - * - * Applicable to: access, stat, fstat, readlink, getxattr - */ - /* {{{ access */ -int32_t -afr_access_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) +int +afr_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; + afr_local_t *local = NULL; - int unwind = 1; - int last_tried = -1; - int this_try = -1; - int read_child = -1; + local = frame->local; - priv = this->private; - children = priv->children; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - local = frame->local; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - read_child = (long) cookie; + AFR_STACK_UNWIND (access, frame, op_ret, op_errno, xdata); - if (op_ret == -1) { - retry: - last_tried = local->cont.access.last_tried; + return 0; +} - if (all_tried (last_tried, priv->child_count)) { - goto out; - } - this_try = ++local->cont.access.last_tried; - if (this_try == read_child) { - goto retry; - } +int +afr_access_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; - unwind = 0; + priv = this->private; + local = frame->local; - STACK_WIND_COOKIE (frame, afr_access_cbk, - (void *) (long) read_child, - children[this_try], - children[this_try]->fops->access, - &local->loc, local->cont.access.mask); + if (subvol == -1) { + AFR_STACK_UNWIND (access, frame, local->op_ret, + local->op_errno, 0); + return 0; } -out: - if (unwind) { - AFR_STACK_UNWIND (frame, op_ret, op_errno); - } + STACK_WIND_COOKIE (frame, afr_access_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->access, + &local->loc, local->cont.access.mask, + local->xdata_req); + return 0; +} + +int +afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, + int mask, dict_t *xdata) +{ + afr_local_t *local = NULL; + int op_errno = 0; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + local->op = GF_FOP_ACCESS; + loc_copy (&local->loc, loc); + local->cont.access.mask = mask; + if (xdata) + local->xdata_req = dict_ref (xdata); + + afr_read_txn (frame, this, loc->inode, afr_access_wind, + AFR_METADATA_TRANSACTION); return 0; +out: + AFR_STACK_UNWIND (access, frame, -1, op_errno, NULL); + + return 0; } +/* }}} */ -int32_t -afr_access (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t mask) +/* {{{ stat */ + +int +afr_stat_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct iatt *buf, dict_t *xdata) { - afr_private_t * priv = NULL; - xlator_t ** children = NULL; - int call_child = 0; - afr_local_t *local = NULL; + afr_local_t *local = NULL; - int32_t read_child = -1; + local = frame->local; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - int32_t op_ret = -1; - int32_t op_errno = 0; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata); - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); + return 0; +} - children = priv->children; - ALLOC_OR_GOTO (local, afr_local_t, out); +int +afr_stat_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; - read_child = afr_read_child (this, loc->inode); + priv = this->private; + local = frame->local; - if (read_child >= 0) { - call_child = read_child; + if (subvol == -1) { + AFR_STACK_UNWIND (stat, frame, local->op_ret, local->op_errno, + 0, 0); + return 0; + } - local->cont.access.last_tried = -1; + STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->stat, + &local->loc, local->xdata_req); + return 0; +} - } else { - call_child = afr_first_up_child (priv); - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_DEBUG, - "no child is up"); - goto out; - } +int +afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + afr_local_t *local = NULL; + int op_errno = 0; - local->cont.access.last_tried = call_child; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + local->op = GF_FOP_STAT; loc_copy (&local->loc, loc); - local->cont.access.mask = mask; + if (xdata) + local->xdata_req = dict_ref (xdata); - STACK_WIND_COOKIE (frame, afr_access_cbk, - (void *) (long) call_child, - children[call_child], children[call_child]->fops->access, - loc, mask); + afr_read_txn (frame, this, loc->inode, afr_stat_wind, + AFR_DATA_TRANSACTION); - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno); - } return 0; +out: + AFR_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); + + return 0; } /* }}} */ -/* {{{ stat */ +/* {{{ fstat */ -int32_t -afr_stat_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct stat *buf) +int +afr_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; + afr_local_t *local = NULL; - int unwind = 1; - int last_tried = -1; - int this_try = -1; - int read_child = -1; + local = frame->local; - priv = this->private; - children = priv->children; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - read_child = (long) cookie; - - local = frame->local; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - if (op_ret == -1) { - retry: - last_tried = local->cont.stat.last_tried; + AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata); - if (all_tried (last_tried, priv->child_count)) { - goto out; - } - this_try = ++local->cont.stat.last_tried; - - if (this_try == read_child) { - goto retry; - } + return 0; +} - unwind = 0; - STACK_WIND_COOKIE (frame, afr_stat_cbk, - (void *) (long) read_child, - children[this_try], - children[this_try]->fops->stat, - &local->loc); - } +int +afr_fstat_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; -out: - if (unwind) { - if (op_ret != -1) - buf->st_ino = local->cont.stat.ino; + priv = this->private; + local = frame->local; - AFR_STACK_UNWIND (frame, op_ret, op_errno, buf); + if (subvol == -1) { + AFR_STACK_UNWIND (fstat, frame, local->op_ret, local->op_errno, + 0, 0); + return 0; } + STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fstat, + local->fd, local->xdata_req); return 0; } int32_t -afr_stat (call_frame_t *frame, xlator_t *this, - loc_t *loc) +afr_fstat (call_frame_t *frame, xlator_t *this, + fd_t *fd, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; + afr_local_t *local = NULL; + int op_errno = 0; - int32_t read_child = -1; - int call_child = 0; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - int32_t op_ret = -1; - int32_t op_errno = 0; + local->op = GF_FOP_FSTAT; + local->fd = fd_ref (fd); + if (xdata) + local->xdata_req = dict_ref (xdata); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_fix_open (fd, this); - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); + afr_read_txn (frame, this, fd->inode, afr_fstat_wind, + AFR_DATA_TRANSACTION); - children = priv->children; + return 0; +out: + AFR_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL); - ALLOC_OR_GOTO (local, afr_local_t, out); + return 0; +} - frame->local = local; +/* }}} */ - read_child = afr_read_child (this, loc->inode); +/* {{{ readlink */ - if (read_child >= 0) { - call_child = read_child; +int +afr_readlink_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + const char *buf, struct iatt *sbuf, dict_t *xdata) +{ + afr_local_t *local = NULL; - local->cont.stat.last_tried = -1; + local = frame->local; - } else { - call_child = afr_first_up_child (priv); - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_DEBUG, - "no child is up"); - goto out; - } + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; - local->cont.stat.last_tried = call_child; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; } - loc_copy (&local->loc, loc); + AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, + buf, sbuf, xdata); + return 0; +} - local->cont.stat.ino = loc->inode->ino; +int +afr_readlink_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child, - children[call_child], - children[call_child]->fops->stat, - loc); + local = frame->local; + priv = this->private; - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + if (subvol == -1) { + AFR_STACK_UNWIND (readlink, frame, local->op_ret, + local->op_errno, 0, 0, 0); + return 0; } + STACK_WIND_COOKIE (frame, afr_readlink_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->readlink, + &local->loc, local->cont.readlink.size, + local->xdata_req); return 0; } -/* }}} */ +int +afr_readlink (call_frame_t *frame, xlator_t *this, + loc_t *loc, size_t size, dict_t *xdata) +{ + afr_local_t * local = NULL; + int32_t op_errno = 0; -/* {{{ fstat */ + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; -int32_t -afr_fstat_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct stat *buf) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; + local->op = GF_FOP_READLINK; + loc_copy (&local->loc, loc); + local->cont.readlink.size = size; + if (xdata) + local->xdata_req = dict_ref (xdata); - int unwind = 1; - int last_tried = -1; - int this_try = -1; - int read_child = -1; + afr_read_txn (frame, this, loc->inode, afr_readlink_wind, + AFR_DATA_TRANSACTION); - priv = this->private; - children = priv->children; + return 0; +out: + AFR_STACK_UNWIND(readlink, frame, -1, op_errno, 0, 0, 0); - local = frame->local; + return 0; +} - read_child = (long) cookie; - if (op_ret == -1) { - retry: - last_tried = local->cont.fstat.last_tried; +/* }}} */ - if (all_tried (last_tried, priv->child_count)) { - goto out; - } - this_try = ++local->cont.fstat.last_tried; +/* {{{ getxattr */ - if (this_try == read_child) { - goto retry; - } +struct _xattr_key { + char *key; + struct list_head list; +}; - unwind = 0; - STACK_WIND_COOKIE (frame, afr_fstat_cbk, - (void *) (long) read_child, - children[this_try], - children[this_try]->fops->fstat, - local->fd); - } +int +__gather_xattr_keys (dict_t *dict, char *key, data_t *value, + void *data) +{ + struct list_head * list = data; + struct _xattr_key * xkey = NULL; -out: - if (unwind) { - if (op_ret != -1) - buf->st_ino = local->cont.fstat.ino; + if (!strncmp (key, AFR_XATTR_PREFIX, + strlen (AFR_XATTR_PREFIX))) { - AFR_STACK_UNWIND (frame, op_ret, op_errno, buf); - } + xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key); + if (!xkey) + return -1; - return 0; + xkey->key = key; + INIT_LIST_HEAD (&xkey->list); + + list_add_tail (&xkey->list, list); + } + return 0; } -int32_t -afr_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd) +void +afr_filter_xattrs (dict_t *dict) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; + struct list_head keys = {0,}; + struct _xattr_key *key = NULL; + struct _xattr_key *tmp = NULL; - int call_child = 0; - int32_t read_child = -1; + INIT_LIST_HEAD (&keys); - int32_t op_ret = -1; - int32_t op_errno = 0; + dict_foreach (dict, __gather_xattr_keys, + (void *) &keys); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - VALIDATE_OR_GOTO (this->private, out); + list_for_each_entry_safe (key, tmp, &keys, list) { + dict_del (dict, key->key); - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); + list_del_init (&key->list); - children = priv->children; + GF_FREE (key); + } +} - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; +int +afr_getxattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; - VALIDATE_OR_GOTO (fd->inode, out); + local = frame->local; - read_child = afr_read_child (this, fd->inode); + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - if (read_child >= 0) { - call_child = read_child; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - local->cont.fstat.last_tried = -1; - } else { - call_child = afr_first_up_child (priv); + if (dict) + afr_filter_xattrs (dict); - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_DEBUG, - "no child is up"); - goto out; - } + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); - local->cont.fstat.last_tried = call_child; - } + return 0; +} - local->cont.fstat.ino = fd->inode->ino; - local->fd = fd_ref (fd); - STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child, - children[call_child], - children[call_child]->fops->fstat, - fd); +int +afr_getxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + local = frame->local; + priv = this->private; + + if (subvol == -1) { + AFR_STACK_UNWIND (getxattr, frame, local->op_ret, + local->op_errno, NULL, NULL); + return 0; } + STACK_WIND_COOKIE (frame, afr_getxattr_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->getxattr, + &local->loc, local->cont.getxattr.name, + local->xdata_req); return 0; } -/* }}} */ -/* {{{ readlink */ +int32_t +afr_getxattr_unwind (call_frame_t *frame, int op_ret, int op_errno, + dict_t *dict, dict_t *xdata) + +{ + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); + return 0; +} int32_t -afr_readlink_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - const char *buf) +afr_fgetxattr_clrlk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + xlator_t **children = NULL; + dict_t *xattr = NULL; + char *tmp_report = NULL; + char lk_summary[1024] = {0,}; + int serz_len = 0; + int32_t callcnt = 0; + long int cky = 0; + int ret = 0; + + priv = this->private; + children = priv->children; + + local = frame->local; + cky = (long) cookie; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == -1) + local->replies[cky].op_errno = op_errno; + + if (!local->dict) + local->dict = dict_new (); + if (local->dict) { + ret = dict_get_str (dict, local->cont.getxattr.name, + &tmp_report); + if (ret) + goto unlock; + ret = dict_set_dynstr (local->dict, + children[cky]->name, + gf_strdup (tmp_report)); + if (ret) + goto unlock; + } + } +unlock: + UNLOCK (&frame->lock); + + if (!callcnt) { + xattr = dict_new (); + if (!xattr) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + ret = dict_serialize_value_with_delim (local->dict, + lk_summary, + &serz_len, '\n'); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Error serializing dictionary"); + goto unwind; + } + if (serz_len == -1) + snprintf (lk_summary, sizeof (lk_summary), + "No locks cleared."); + ret = dict_set_dynstr (xattr, local->cont.getxattr.name, + gf_strdup (lk_summary)); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Error setting dictionary"); + goto unwind; + } - int unwind = 1; - int last_tried = -1; - int this_try = -1; - int read_child = -1; + unwind: + // Updating child_errno with more recent 'events' + op_errno = afr_final_errno (local, priv); - priv = this->private; - children = priv->children; + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr, + xdata); + if (xattr) + dict_unref (xattr); + } - local = frame->local; + return ret; +} - read_child = (long) cookie; +int32_t +afr_getxattr_clrlk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + xlator_t **children = NULL; + dict_t *xattr = NULL; + char *tmp_report = NULL; + char lk_summary[1024] = {0,}; + int serz_len = 0; + int32_t callcnt = 0; + long int cky = 0; + int ret = 0; + + priv = this->private; + children = priv->children; + + local = frame->local; + cky = (long) cookie; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == -1) + local->replies[cky].op_errno = op_errno; + + if (!local->dict) + local->dict = dict_new (); + if (local->dict) { + ret = dict_get_str (dict, local->cont.getxattr.name, + &tmp_report); + if (ret) + goto unlock; + ret = dict_set_dynstr (local->dict, + children[cky]->name, + gf_strdup (tmp_report)); + if (ret) + goto unlock; + } + } +unlock: + UNLOCK (&frame->lock); + + if (!callcnt) { + xattr = dict_new (); + if (!xattr) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + ret = dict_serialize_value_with_delim (local->dict, + lk_summary, + &serz_len, '\n'); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Error serializing dictionary"); + goto unwind; + } + if (serz_len == -1) + snprintf (lk_summary, sizeof (lk_summary), + "No locks cleared."); + ret = dict_set_dynstr (xattr, local->cont.getxattr.name, + gf_strdup (lk_summary)); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Error setting dictionary"); + goto unwind; + } - if (op_ret == -1) { - retry: - last_tried = local->cont.readlink.last_tried; + unwind: + // Updating child_errno with more recent 'events' + op_errno = afr_final_errno (local, priv); - if (all_tried (last_tried, priv->child_count)) { - goto out; - } - this_try = ++local->cont.readlink.last_tried; + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata); - if (this_try == read_child) { - goto retry; - } + if (xattr) + dict_unref (xattr); + } - unwind = 0; - STACK_WIND_COOKIE (frame, afr_readlink_cbk, - (void *) (long) read_child, - children[this_try], - children[this_try]->fops->readlink, - &local->loc, - local->cont.readlink.size); - } + return ret; +} -out: - if (unwind) { - AFR_STACK_UNWIND (frame, op_ret, op_errno, buf); - } +/** + * node-uuid cbk uses next child querying mechanism + */ +int32_t +afr_getxattr_node_uuid_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + xlator_t **children = NULL; + int unwind = 1; + int curr_call_child = 0; + + priv = this->private; + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { /** query the _next_ child */ + + /** + * _current_ becomes _next_ + * If done with all childs and yet no success; give up ! + */ + curr_call_child = (int) ((long)cookie); + if (++curr_call_child == priv->child_count) + goto unwind; + + gf_log (this->name, GF_LOG_WARNING, + "op_ret (-1): Re-querying afr-child (%d/%d)", + curr_call_child, priv->child_count); + + unwind = 0; + STACK_WIND_COOKIE (frame, afr_getxattr_node_uuid_cbk, + (void *) (long) curr_call_child, + children[curr_call_child], + children[curr_call_child]->fops->getxattr, + &local->loc, + local->cont.getxattr.name, + NULL); + } - return 0; -} + unwind: + if (unwind) + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, + NULL); + return 0; +} int32_t -afr_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size) +afr_getxattr_lockinfo_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) { - afr_private_t * priv = NULL; - xlator_t ** children = NULL; - int call_child = 0; - afr_local_t *local = NULL; + int call_cnt = 0, len = 0; + char *lockinfo_buf = NULL; + dict_t *lockinfo = NULL, *newdict = NULL; + afr_local_t *local = NULL; - int32_t read_child = -1; + LOCK (&frame->lock); + { + local = frame->local; - int32_t op_ret = -1; - int32_t op_errno = 0; + call_cnt = --local->call_count; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + if ((op_ret < 0) || (!dict && !xdata)) { + goto unlock; + } - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); + if (xdata) { + if (!local->xdata_rsp) { + local->xdata_rsp = dict_new (); + if (!local->xdata_rsp) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } + } + } - children = priv->children; + if (!dict) { + goto unlock; + } - ALLOC_OR_GOTO (local, afr_local_t, out); + op_ret = dict_get_ptr_and_len (dict, GF_XATTR_LOCKINFO_KEY, + (void **)&lockinfo_buf, &len); - frame->local = local; + if (!lockinfo_buf) { + goto unlock; + } - read_child = afr_read_child (this, loc->inode); + if (!local->dict) { + local->dict = dict_new (); + if (!local->dict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } + } + } +unlock: + UNLOCK (&frame->lock); + + if (lockinfo_buf != NULL) { + lockinfo = dict_new (); + if (lockinfo == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + } else { + op_ret = dict_unserialize (lockinfo_buf, len, + &lockinfo); + + if (lockinfo && local->dict) { + dict_copy (lockinfo, local->dict); + } + } + } - if (read_child >= 0) { - call_child = read_child; + if (xdata && local->xdata_rsp) { + dict_copy (xdata, local->xdata_rsp); + } - local->cont.readlink.last_tried = -1; + if (!call_cnt) { + newdict = dict_new (); + if (!newdict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } - } else { - call_child = afr_first_up_child (priv); + len = dict_serialized_length (local->dict); + if (len == 0) { + goto unwind; + } - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_DEBUG, - "no child is up"); - goto out; + lockinfo_buf = GF_CALLOC (1, len, gf_common_mt_char); + if (!lockinfo_buf) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; } - local->cont.readlink.last_tried = call_child; - } + op_ret = dict_serialize (local->dict, lockinfo_buf); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + } - loc_copy (&local->loc, loc); - local->cont.readlink.size = size; + op_ret = dict_set_dynptr (newdict, GF_XATTR_LOCKINFO_KEY, + (void *)lockinfo_buf, len); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + goto unwind; + } - STACK_WIND_COOKIE (frame, afr_readlink_cbk, - (void *) (long) call_child, - children[call_child], children[call_child]->fops->readlink, - loc, size); + unwind: + AFR_STACK_UNWIND (getxattr, frame, op_ret, + op_errno, newdict, + local->xdata_rsp); + } - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); - } - return 0; + dict_unref (lockinfo); + + return 0; } +int32_t +afr_fgetxattr_lockinfo_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + int call_cnt = 0, len = 0; + char *lockinfo_buf = NULL; + dict_t *lockinfo = NULL, *newdict = NULL; + afr_local_t *local = NULL; -/* }}} */ + LOCK (&frame->lock); + { + local = frame->local; -/* {{{ getxattr */ + call_cnt = --local->call_count; -struct _xattr_key { - char *key; - struct list_head list; -}; + if ((op_ret < 0) || (!dict && !xdata)) { + goto unlock; + } + if (xdata) { + if (!local->xdata_rsp) { + local->xdata_rsp = dict_new (); + if (!local->xdata_rsp) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } + } + } -void -__gather_xattr_keys (dict_t *dict, char *key, data_t *value, - void *data) -{ - struct list_head * list = data; - struct _xattr_key * xkey = NULL; + if (!dict) { + goto unlock; + } - if (!strncmp (key, AFR_XATTR_PREFIX, - strlen (AFR_XATTR_PREFIX))) { + op_ret = dict_get_ptr_and_len (dict, GF_XATTR_LOCKINFO_KEY, + (void **)&lockinfo_buf, &len); - xkey = CALLOC (1, sizeof (*xkey)); - if (!xkey) - return; + if (!lockinfo_buf) { + goto unlock; + } - xkey->key = key; - INIT_LIST_HEAD (&xkey->list); + if (!local->dict) { + local->dict = dict_new (); + if (!local->dict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } + } + } +unlock: + UNLOCK (&frame->lock); + + if (lockinfo_buf != NULL) { + lockinfo = dict_new (); + if (lockinfo == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + } else { + op_ret = dict_unserialize (lockinfo_buf, len, + &lockinfo); + + if (lockinfo && local->dict) { + dict_copy (lockinfo, local->dict); + } + } + } - list_add_tail (&xkey->list, list); + if (xdata && local->xdata_rsp) { + dict_copy (xdata, local->xdata_rsp); } -} + if (!call_cnt) { + newdict = dict_new (); + if (!newdict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } -void -__filter_xattrs (dict_t *dict) + len = dict_serialized_length (local->dict); + if (len <= 0) { + goto unwind; + } + + lockinfo_buf = GF_CALLOC (1, len, gf_common_mt_char); + if (!lockinfo_buf) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } + + op_ret = dict_serialize (local->dict, lockinfo_buf); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + } + + op_ret = dict_set_dynptr (newdict, GF_XATTR_LOCKINFO_KEY, + (void *)lockinfo_buf, len); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + goto unwind; + } + + unwind: + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, + op_errno, newdict, + local->xdata_rsp); + } + + dict_unref (lockinfo); + + return 0; +} + +int32_t +afr_fgetxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) { - struct list_head keys; + afr_local_t *local = NULL; + int32_t callcnt = 0; + int ret = 0; + char *xattr = NULL; + char *xattr_serz = NULL; + char xattr_cky[1024] = {0,}; + dict_t *nxattr = NULL; + long cky = 0; + int32_t padding = 0; + int32_t tlen = 0; + + if (!frame || !frame->local || !this) { + gf_log ("", GF_LOG_ERROR, "possible NULL deref"); + goto out; + } - struct _xattr_key *key; - struct _xattr_key *tmp; + local = frame->local; + cky = (long) cookie; - INIT_LIST_HEAD (&keys); + LOCK (&frame->lock); + { + callcnt = --local->call_count; - dict_foreach (dict, __gather_xattr_keys, - (void *) &keys); + if (op_ret < 0) { + local->op_errno = op_errno; + } else { + local->op_ret = op_ret; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + } - list_for_each_entry_safe (key, tmp, &keys, list) { - dict_del (dict, key->key); + if (!dict || (op_ret < 0)) + goto out; - list_del_init (&key->list); + if (!local->dict) + local->dict = dict_new (); + + if (local->dict) { + ret = dict_get_str (dict, + local->cont.getxattr.name, + &xattr); + if (ret) + goto out; + + xattr = gf_strdup (xattr); + + (void)snprintf (xattr_cky, 1024, "%s-%ld", + local->cont.getxattr.name, cky); + ret = dict_set_dynstr (local->dict, + xattr_cky, xattr); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Cannot set xattr cookie key"); + goto out; + } + + local->cont.getxattr.xattr_len + += strlen (xattr) + 1; + } + } +out: + UNLOCK (&frame->lock); + + if (!callcnt) { + if (!local->cont.getxattr.xattr_len) + goto unwind; + + nxattr = dict_new (); + if (!nxattr) + goto unwind; + + /* extra bytes for decorations (brackets and <>'s) */ + padding += strlen (this->name) + + strlen (AFR_PATHINFO_HEADER) + 4; + local->cont.getxattr.xattr_len += (padding + 2); + + xattr_serz = GF_CALLOC (local->cont.getxattr.xattr_len, + sizeof (char), gf_common_mt_char); + + if (!xattr_serz) + goto unwind; + + /* the xlator info */ + (void) sprintf (xattr_serz, "(<"AFR_PATHINFO_HEADER"%s> ", + this->name); + + /* actual series of pathinfo */ + ret = dict_serialize_value_with_delim (local->dict, + xattr_serz + + strlen (xattr_serz), + &tlen, ' '); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Error serializing" + " dictionary"); + goto unwind; + } + + /* closing part */ + *(xattr_serz + padding + tlen) = ')'; + *(xattr_serz + padding + tlen + 1) = '\0'; + + ret = dict_set_dynstr (nxattr, local->cont.getxattr.name, + xattr_serz); + if (ret) + gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo" + " key in dict"); - FREE (key); + unwind: + AFR_STACK_UNWIND (fgetxattr, frame, local->op_ret, + local->op_errno, nxattr, local->xdata_rsp); + + if (nxattr) + dict_unref (nxattr); } + + return ret; } +int32_t +afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t callcnt = 0; + int ret = 0; + char *xattr = NULL; + char *xattr_serz = NULL; + char xattr_cky[1024] = {0,}; + dict_t *nxattr = NULL; + long cky = 0; + int32_t padding = 0; + int32_t tlen = 0; + + if (!frame || !frame->local || !this) { + gf_log ("", GF_LOG_ERROR, "possible NULL deref"); + goto out; + } + local = frame->local; + cky = (long) cookie; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret < 0) { + local->op_errno = op_errno; + } else { + local->op_ret = op_ret; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + } + + if (!dict || (op_ret < 0)) + goto out; + + if (!local->dict) + local->dict = dict_new (); + + if (local->dict) { + ret = dict_get_str (dict, + local->cont.getxattr.name, + &xattr); + if (ret) + goto out; + + xattr = gf_strdup (xattr); + + (void)snprintf (xattr_cky, 1024, "%s-%ld", + local->cont.getxattr.name, cky); + ret = dict_set_dynstr (local->dict, + xattr_cky, xattr); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Cannot set xattr cookie key"); + goto out; + } + + local->cont.getxattr.xattr_len += strlen (xattr) + 1; + } + } + out: + UNLOCK (&frame->lock); + + if (!callcnt) { + if (!local->cont.getxattr.xattr_len) + goto unwind; + + nxattr = dict_new (); + if (!nxattr) + goto unwind; + + /* extra bytes for decorations (brackets and <>'s) */ + padding += strlen (this->name) + strlen (AFR_PATHINFO_HEADER) + 4; + local->cont.getxattr.xattr_len += (padding + 2); + + xattr_serz = GF_CALLOC (local->cont.getxattr.xattr_len, + sizeof (char), gf_common_mt_char); + + if (!xattr_serz) + goto unwind; + + /* the xlator info */ + (void) sprintf (xattr_serz, "(<"AFR_PATHINFO_HEADER"%s> ", + this->name); + + /* actual series of pathinfo */ + ret = dict_serialize_value_with_delim (local->dict, + xattr_serz + strlen (xattr_serz), + &tlen, ' '); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Error serializing" + " dictionary"); + goto unwind; + } -int32_t -afr_getxattr_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict) + /* closing part */ + *(xattr_serz + padding + tlen) = ')'; + *(xattr_serz + padding + tlen + 1) = '\0'; + + ret = dict_set_dynstr (nxattr, local->cont.getxattr.name, + xattr_serz); + if (ret) + gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo" + " key in dict"); + + unwind: + AFR_STACK_UNWIND (getxattr, frame, local->op_ret, + local->op_errno, nxattr, local->xdata_rsp); + + if (nxattr) + dict_unref (nxattr); + } + + return ret; +} + +static int +afr_aggregate_stime_xattr (dict_t *this, char *key, data_t *value, void *data) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; + int ret = 0; - int unwind = 1; - int last_tried = -1; - int this_try = -1; - int read_child = -1; + if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) + ret = gf_get_max_stime (THIS, data, key, value); - priv = this->private; - children = priv->children; + return ret; +} - local = frame->local; +int32_t +afr_common_getxattr_stime_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t callcnt = 0; - read_child = (long) cookie; + if (!frame || !frame->local || !this) { + gf_log ("", GF_LOG_ERROR, "possible NULL deref"); + goto out; + } - if (op_ret == -1) { - retry: - last_tried = local->cont.getxattr.last_tried; + local = frame->local; - if (all_tried (last_tried, priv->child_count)) { - goto out; - } - this_try = ++local->cont.getxattr.last_tried; + LOCK (&frame->lock); + { + callcnt = --local->call_count; - if (this_try == read_child) { - goto retry; + if (!dict || (op_ret < 0)) { + local->op_errno = op_errno; + goto cleanup; } - unwind = 0; - STACK_WIND_COOKIE (frame, afr_getxattr_cbk, - (void *) (long) read_child, - children[this_try], - children[this_try]->fops->getxattr, - &local->loc, - local->cont.getxattr.name); - } + if (!local->dict) + local->dict = dict_copy_with_ref (dict, NULL); + else + dict_foreach (dict, afr_aggregate_stime_xattr, + local->dict); + local->op_ret = 0; + } + +cleanup: + UNLOCK (&frame->lock); + + if (!callcnt) { + AFR_STACK_UNWIND (getxattr, frame, local->op_ret, + local->op_errno, local->dict, xdata); + } out: - if (unwind) { - __filter_xattrs (dict); + return 0; +} - AFR_STACK_UNWIND (frame, op_ret, op_errno, dict); - } - return 0; +static gf_boolean_t +afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk, + gf_boolean_t is_fgetxattr) +{ + gf_boolean_t is_spl = _gf_true; + + GF_ASSERT (cbk); + if (!cbk || !name) { + is_spl = _gf_false; + goto out; + } + + if (!strcmp (name, GF_XATTR_PATHINFO_KEY) || + !strcmp (name, GF_XATTR_USER_PATHINFO_KEY)) { + if (is_fgetxattr) { + *cbk = afr_fgetxattr_pathinfo_cbk; + } else { + *cbk = afr_getxattr_pathinfo_cbk; + } + } else if (!strncmp (name, GF_XATTR_CLRLK_CMD, + strlen (GF_XATTR_CLRLK_CMD))) { + if (is_fgetxattr) { + *cbk = afr_fgetxattr_clrlk_cbk; + } else { + *cbk = afr_getxattr_clrlk_cbk; + } + } else if (!strncmp (name, GF_XATTR_LOCKINFO_KEY, + strlen (GF_XATTR_LOCKINFO_KEY))) { + if (is_fgetxattr) { + *cbk = afr_fgetxattr_lockinfo_cbk; + } else { + *cbk = afr_getxattr_lockinfo_cbk; + } + } else if (fnmatch (GF_XATTR_STIME_PATTERN, name, FNM_NOESCAPE) == 0) { + *cbk = afr_common_getxattr_stime_cbk; + } else { + is_spl = _gf_false; + } + +out: + return is_spl; } +static void +afr_getxattr_all_subvols (xlator_t *this, call_frame_t *frame, + const char *name, loc_t *loc, + fop_getxattr_cbk_t cbk) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + + priv = this->private; + + local = frame->local; + //local->call_count set in afr_local_init + call_count = local->call_count; + + //If up-children count is 0, afr_local_init would have failed already + //and the call would have unwound so not handling it here. + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->getxattr, + loc, name, NULL); + if (!--call_count) + break; + } + } + return; +} int32_t afr_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name) + loc_t *loc, const char *name, dict_t *xdata) { - afr_private_t * priv = NULL; - xlator_t ** children = NULL; - int call_child = 0; - afr_local_t * local = NULL; + afr_private_t *priv = NULL; + xlator_t **children = NULL; + afr_local_t *local = NULL; + xlator_list_t *trav = NULL; + xlator_t **sub_volumes = NULL; + int i = 0; + int32_t op_errno = 0; + int ret = -1; + fop_getxattr_cbk_t cbk = NULL; + int afr_xtime_gauge[MCNT_MAX] = {0,}; - int read_child = -1; - int32_t op_ret = -1; - int32_t op_errno = 0; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + priv = this->private; - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); + children = priv->children; - children = priv->children; + loc_copy (&local->loc, loc); - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; + local->op = GF_FOP_GETXATTR; - if (name) { - if (!strncmp (name, AFR_XATTR_PREFIX, - strlen (AFR_XATTR_PREFIX))) { + if (xdata) + local->xdata_req = dict_ref (xdata); - op_errno = ENODATA; - goto out; - } + if (!name) + goto no_name; + + local->cont.getxattr.name = gf_strdup (name); + + if (!local->cont.getxattr.name) { + op_errno = ENOMEM; + goto out; + } + + if (!strncmp (name, AFR_XATTR_PREFIX, + strlen (AFR_XATTR_PREFIX))) { + gf_log (this->name, GF_LOG_INFO, + "%s: no data present for key %s", + loc->path, name); + op_errno = ENODATA; + goto out; } + if ((strcmp (GF_XATTR_MARKER_KEY, name) == 0) + && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { - read_child = afr_read_child (this, loc->inode); + local->marker.call_count = priv->child_count; - if (read_child >= 0) { - call_child = read_child; + sub_volumes = alloca ( priv->child_count * sizeof (xlator_t *)); + for (i = 0, trav = this->children; trav ; + trav = trav->next, i++) { - local->cont.getxattr.last_tried = -1; - } else { - call_child = afr_first_up_child (priv); + *(sub_volumes + i) = trav->xlator; + } - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_DEBUG, - "no child is up"); + if (cluster_getmarkerattr (frame, this, loc, name, + local, afr_getxattr_unwind, + sub_volumes, + priv->child_count, + MARKER_UUID_TYPE, + marker_uuid_default_gauge, + priv->vol_uuid)) { + + gf_log (this->name, GF_LOG_INFO, + "%s: failed to get marker attr (%s)", + loc->path, name); + op_errno = EINVAL; goto out; } - local->cont.getxattr.last_tried = call_child; + return 0; } - loc_copy (&local->loc, loc); - if (name) - local->cont.getxattr.name = strdup (name); + /* + * if we are doing getxattr with pathinfo as the key then we + * collect information from all childs + */ + if (afr_is_special_xattr (name, &cbk, 0)) { + afr_getxattr_all_subvols (this, frame, name, loc, cbk); + return 0; + } - STACK_WIND_COOKIE (frame, afr_getxattr_cbk, - (void *) (long) call_child, - children[call_child], children[call_child]->fops->getxattr, - loc, name); + if (XATTR_IS_NODE_UUID (name)) { + i = 0; + STACK_WIND_COOKIE (frame, afr_getxattr_node_uuid_cbk, + (void *) (long) i, + children[i], + children[i]->fops->getxattr, + loc, name, xdata); + return 0; + } - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); - } - return 0; -} + if (*priv->vol_uuid) { + if ((match_uuid_local (name, priv->vol_uuid) == 0) + && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { + local->marker.call_count = priv->child_count; + + sub_volumes = alloca ( priv->child_count + * sizeof (xlator_t *)); + for (i = 0, trav = this->children; trav ; + trav = trav->next, i++) { + + *(sub_volumes + i) = trav->xlator; + + } + + /* don't err out on getting ENOTCONN (brick down) + * from a subset of the bricks + */ + memcpy (afr_xtime_gauge, marker_xtime_default_gauge, + sizeof (afr_xtime_gauge)); + afr_xtime_gauge[MCNT_NOTFOUND] = 0; + afr_xtime_gauge[MCNT_ENOTCONN] = 0; + if (cluster_getmarkerattr (frame, this, loc, + name, local, + afr_getxattr_unwind, + sub_volumes, + priv->child_count, + MARKER_XTIME_TYPE, + afr_xtime_gauge, + priv->vol_uuid)) { + gf_log (this->name, GF_LOG_INFO, + "%s: failed to get marker attr (%s)", + loc->path, name); + op_errno = EINVAL; + goto out; + } + + return 0; + } + } +no_name: -/* }}} */ + afr_read_txn (frame, this, local->loc.inode, afr_getxattr_wind, + AFR_METADATA_TRANSACTION); -/* {{{ readv */ + ret = 0; +out: + if (ret < 0) + AFR_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL); + return 0; +} + +/* {{{ fgetxattr */ -/** - * read algorithm: - * - * if the user has specified a read subvolume, use it - * otherwise - - * use the inode number to hash it to one of the subvolumes, and - * read from there (to balance read load) - * - * if any of the above read's fail, try the children in sequence - * beginning at the beginning - */ int32_t -afr_readv_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iovec *vector, int32_t count, struct stat *buf, - struct iobref *iobref) +afr_fgetxattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; + afr_local_t *local = NULL; - int unwind = 1; - int last_tried = -1; - int this_try = -1; - int read_child = -1; + local = frame->local; + + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; + + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + if (dict) + afr_filter_xattrs (dict); - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, xdata); - children = priv->children; + return 0; +} + +int +afr_fgetxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; local = frame->local; + priv = this->private; + + if (subvol == -1) { + AFR_STACK_UNWIND (fgetxattr, frame, local->op_ret, + local->op_errno, NULL, NULL); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fgetxattr, + local->fd, local->cont.getxattr.name, + local->xdata_req); + return 0; +} + + +static void +afr_fgetxattr_all_subvols (xlator_t *this, call_frame_t *frame, + fop_fgetxattr_cbk_t cbk) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + + priv = this->private; + + local = frame->local; + //local->call_count set in afr_local_init + call_count = local->call_count; + + //If up-children count is 0, afr_local_init would have failed already + //and the call would have unwound so not handling it here. + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fgetxattr, + local->fd, local->cont.getxattr.name, + NULL); + if (!--call_count) + break; + } + } - read_child = (long) cookie; + return; +} + + +int +afr_fgetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t op_errno = 0; + fop_fgetxattr_cbk_t cbk = NULL; - if (op_ret == -1) { - retry: - last_tried = local->cont.readv.last_tried; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - if (all_tried (last_tried, priv->child_count)) { + local->op = GF_FOP_FGETXATTR; + local->fd = fd_ref (fd); + if (name) { + local->cont.getxattr.name = gf_strdup (name); + if (!local->cont.getxattr.name) { + op_errno = ENOMEM; goto out; } - this_try = ++local->cont.readv.last_tried; - - if (this_try == read_child) { - /* - skip the read child since if we are here - we must have already tried that child - */ - goto retry; - } + } + if (xdata) + local->xdata_req = dict_ref (xdata); + + /* pathinfo gets handled only in getxattr(), but we need to handle + * lockinfo. + * If we are doing fgetxattr with lockinfo as the key then we + * collect information from all children. + */ + if (afr_is_special_xattr (name, &cbk, 1)) { + afr_fgetxattr_all_subvols (this, frame, cbk); + return 0; + } - unwind = 0; + afr_fix_open (fd, this); - STACK_WIND_COOKIE (frame, afr_readv_cbk, - (void *) (long) read_child, - children[this_try], - children[this_try]->fops->readv, - local->fd, local->cont.readv.size, - local->cont.readv.offset); - } + afr_read_txn (frame, this, fd->inode, afr_fgetxattr_wind, + AFR_METADATA_TRANSACTION); + return 0; out: - if (unwind) { - AFR_STACK_UNWIND (frame, op_ret, op_errno, vector, count, buf, - iobref); - } + AFR_STACK_UNWIND (fgetxattr, frame, -1, op_errno, NULL, NULL); - return 0; + return 0; } -int32_t -afr_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset) +/* }}} */ + +/* {{{ readv */ + +int +afr_readv_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct iovec *vector, int32_t count, struct iatt *buf, + struct iobref *iobref, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; + afr_local_t *local = NULL; - int32_t read_child = -1; - int call_child = 0; + local = frame->local; + + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; - int32_t op_ret = -1; - int32_t op_errno = 0; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (fd, out); + AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, + vector, count, buf, iobref, xdata); + return 0; +} - priv = this->private; - children = priv->children; - ALLOC_OR_GOTO (local, afr_local_t, out); +int +afr_readv_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - frame->local = local; + local = frame->local; + priv = this->private; - read_child = afr_read_child (this, fd->inode); + if (subvol == -1) { + AFR_STACK_UNWIND (readv, frame, local->op_ret, local->op_errno, + 0, 0, 0, 0, 0); + return 0; + } - if (read_child >= 0) { - call_child = read_child; + STACK_WIND_COOKIE (frame, afr_readv_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->readv, + local->fd, local->cont.readv.size, + local->cont.readv.offset, local->cont.readv.flags, + local->xdata_req); + return 0; +} - /* - if read fails from the read child, we try - all children starting with the first one - */ - local->cont.readv.last_tried = -1; - } else { - call_child = afr_first_up_child (priv); - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_DEBUG, - "no child is up"); - goto out; - } +int +afr_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + afr_local_t * local = NULL; + int32_t op_errno = 0; - local->cont.readv.last_tried = call_child; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - local->fd = fd_ref (fd); + local->op = GF_FOP_READ; + local->fd = fd_ref (fd); + local->cont.readv.size = size; + local->cont.readv.offset = offset; + local->cont.readv.flags = flags; + if (xdata) + local->xdata_req = dict_ref (xdata); - local->cont.readv.size = size; - local->cont.readv.offset = offset; + afr_fix_open (fd, this); - STACK_WIND_COOKIE (frame, afr_readv_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readv, - fd, size, offset); + afr_read_txn (frame, this, fd->inode, afr_readv_wind, + AFR_DATA_TRANSACTION); - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, 0, NULL, - NULL); - } + AFR_STACK_UNWIND(readv, frame, -1, op_errno, 0, 0, 0, 0, 0); + return 0; } diff --git a/xlators/cluster/afr/src/afr-inode-read.h b/xlators/cluster/afr/src/afr-inode-read.h index 1127cb652..e4091a793 100644 --- a/xlators/cluster/afr/src/afr-inode-read.h +++ b/xlators/cluster/afr/src/afr-inode-read.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __INODE_READ_H__ @@ -22,26 +13,30 @@ int32_t afr_access (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t mask); + loc_t *loc, int32_t mask, dict_t *xdata); int32_t afr_stat (call_frame_t *frame, xlator_t *this, - loc_t *loc); + loc_t *loc, dict_t *xdata); int32_t afr_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd); + fd_t *fd, dict_t *xdata); int32_t afr_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size); + loc_t *loc, size_t size, dict_t *xdata); int32_t afr_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset); + fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata); int32_t afr_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name); + loc_t *loc, const char *name, dict_t *xdata); + +int32_t +afr_fgetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata); #endif /* __INODE_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 57fdf20d5..00e0d2676 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -48,1965 +39,1713 @@ #include "afr-transaction.h" -/* {{{ chmod */ - - -int -afr_chmod_unwind (call_frame_t *frame, xlator_t *this) +static void +__afr_inode_write_finalize (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int read_subvol = 0; + int i = 0; local = frame->local; - priv = this->private; + priv = this->private; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; + if (local->inode) { + if (local->transaction.type == AFR_METADATA_TRANSACTION) + read_subvol = afr_metadata_subvol_get (local->inode, this, + NULL, NULL); + else + read_subvol = afr_data_subvol_get (local->inode, this, + NULL, NULL); } - UNLOCK (&frame->lock); - if (main_frame) { - local->cont.chmod.buf.st_ino = local->cont.chmod.ino; - AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, - &local->cont.chmod.buf); - } - return 0; -} - - -int -afr_chmod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int child_index = (long) cookie; - int need_unwind = 0; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.chmod.buf = *buf; - } - local->success_count++; + local->op_ret = -1; + local->op_errno = afr_final_errno (local, priv); + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret < 0) { + afr_inode_read_subvol_reset (local->inode, this); + continue; + } - if (local->success_count == priv->wait_count) { - need_unwind = 1; + /* Order of checks in the compound conditional + below is important. + + - Highest precedence: largest op_ret + - Next precendence: if all op_rets are equal, read subvol + - Least precedence: any succeeded subvol + */ + if ((local->op_ret < local->replies[i].op_ret) || + ((local->op_ret == local->replies[i].op_ret) && + (i == read_subvol))) { + + local->op_ret = local->replies[i].op_ret; + local->op_errno = local->replies[i].op_errno; + + local->cont.inode_wfop.prebuf = + local->replies[i].prestat; + local->cont.inode_wfop.postbuf = + local->replies[i].poststat; + + if (local->replies[i].xdata) { + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); + local->xdata_rsp = + dict_ref (local->replies[i].xdata); } } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - if (need_unwind) - afr_chmod_unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); } - - return 0; } -int -afr_chmod_wind (call_frame_t *frame, xlator_t *this) +static void +__afr_inode_write_fill (call_frame_t *frame, xlator_t *this, int child_index, + int op_ret, int op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int i = 0; - int call_count = -1; + afr_local_t *local = NULL; - local = frame->local; - priv = this->private; + local = frame->local; - call_count = afr_up_children_count (priv->child_count, local->child_up); + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; + if (op_ret >= 0) { + if (prebuf) + local->replies[child_index].prestat = *prebuf; + if (postbuf) + local->replies[child_index].poststat = *postbuf; + if (xdata) + local->replies[child_index].xdata = dict_ref (xdata); + } else { + afr_transaction_fop_failed (frame, this, child_index); } - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_chmod_wind_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->chmod, - &local->loc, - local->cont.chmod.mode); - - if (!--call_count) - break; - } - } - - return 0; + return; } -int -afr_chmod_done (call_frame_t *frame, xlator_t *this) +static int +__afr_inode_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = NULL; + afr_local_t *local = NULL; + int child_index = (long) cookie; + int call_count = -1; - local = frame->local; + local = frame->local; - local->transaction.unwind (frame, this); + LOCK (&frame->lock); + { + __afr_inode_write_fill (frame, this, child_index, op_ret, + op_errno, prebuf, postbuf, xdata); + } + UNLOCK (&frame->lock); - AFR_STACK_DESTROY (frame); - - return 0; -} + call_count = afr_frame_return (frame); + if (call_count == 0) { + __afr_inode_write_finalize (frame, this); -int32_t -afr_chmod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - - int ret = -1; - - int op_ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - ALLOC_OR_GOTO (local, afr_local_t, out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + if (afr_txn_nothing_failed (frame, this)) + local->transaction.unwind (frame, this); - transaction_frame->local = local; + local->transaction.resume (frame, this); + } - local->cont.chmod.mode = mode; - local->cont.chmod.ino = loc->inode->ino; - - local->transaction.fop = afr_chmod_wind; - local->transaction.done = afr_chmod_done; - local->transaction.unwind = afr_chmod_unwind; - - loc_copy (&local->loc, loc); - - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; - - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + return 0; +} - op_ret = 0; -out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); - } +/* {{{ writev */ - return 0; +void +afr_writev_copy_outvars (call_frame_t *src_frame, call_frame_t *dst_frame) +{ + afr_local_t *src_local = NULL; + afr_local_t *dst_local = NULL; + + src_local = src_frame->local; + dst_local = dst_frame->local; + + dst_local->op_ret = src_local->op_ret; + dst_local->op_errno = src_local->op_errno; + dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf; + dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf; + if (src_local->xdata_rsp) + dst_local->xdata_rsp = dict_ref (src_local->xdata_rsp); } -/* }}} */ - +void +afr_writev_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + local = frame->local; + + AFR_STACK_UNWIND (writev, frame, + local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + local->xdata_rsp); +} -/* {{{ fchmod */ int -afr_fchmod_unwind (call_frame_t *frame, xlator_t *this) +afr_transaction_writev_unwind (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; - priv = this->private; + call_frame_t *fop_frame = NULL; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + fop_frame = afr_transaction_detach_fop_frame (frame); - if (main_frame) { - local->cont.fchmod.buf.st_ino = local->cont.fchmod.ino; - AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, - &local->cont.fchmod.buf); - } - return 0; + if (fop_frame) { + afr_writev_copy_outvars (frame, fop_frame); + afr_writev_unwind (fop_frame, this); + } + return 0; } +static void +afr_writev_handle_short_writes (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + /* + * We already have the best case result of the writev calls staged + * as the return value. Any writev that returns some value less + * than the best case is now out of sync, so mark the fop as + * failed. Note that fops that have returned with errors have + * already been marked as failed. + */ + for (i = 0; i < priv->child_count; i++) { + if ((!local->replies[i].valid) || + (local->replies[i].op_ret == -1)) + continue; + + if (local->replies[i].op_ret < local->op_ret) + afr_transaction_fop_failed(frame, this, i); + } +} int -afr_fchmod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) +afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int child_index = (long) cookie; - int need_unwind = 0; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.fchmod.buf = *buf; - } - local->success_count++; - - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } + afr_local_t * local = NULL; + call_frame_t *fop_frame = NULL; + int child_index = (long) cookie; + int call_count = -1; + int ret = 0; + uint32_t open_fd_count = 0; + uint32_t write_is_append = 0; + + local = frame->local; + + LOCK (&frame->lock); + { + __afr_inode_write_fill (frame, this, child_index, op_ret, + op_errno, prebuf, postbuf, xdata); + if (op_ret == -1 || !xdata) + goto unlock; + + write_is_append = 0; + ret = dict_get_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND, + &write_is_append); + if (ret || !write_is_append) + local->append_write = _gf_false; + + ret = dict_get_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT, + &open_fd_count); + if (ret == -1) + goto unlock; + if ((open_fd_count > local->open_fd_count)) { + local->open_fd_count = open_fd_count; + local->update_open_fd_count = _gf_true; } + } +unlock: + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (!local->stable_write && !local->append_write) + /* An appended write removes the necessity to + fsync() the file. This is because self-heal + has the logic to check for larger file when + the xattrs are not reliably pointing at + a stale file. + */ + afr_fd_report_unstable_write (this, local->fd); + + __afr_inode_write_finalize (frame, this); + + afr_writev_handle_short_writes (frame, this); + + if (local->update_open_fd_count) + afr_handle_open_fd_count (frame, this); + + if (!afr_txn_nothing_failed (frame, this)) { + //Don't unwind until post-op is complete + local->transaction.resume (frame, this); + } else { + /* + * Generally inode-write fops do transaction.unwind then + * transaction.resume, but writev needs to make sure that + * delayed post-op frame is placed in fdctx before unwind + * happens. This prevents the race of flush doing the + * changelog wakeup first in fuse thread and then this + * writev placing its delayed post-op frame in fdctx. + * This helps flush make sure all the delayed post-ops are + * completed. + */ + + fop_frame = afr_transaction_detach_fop_frame (frame); + afr_writev_copy_outvars (frame, fop_frame); + local->transaction.resume (frame, this); + afr_writev_unwind (fop_frame, this); + } + } + return 0; +} - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - if (need_unwind) - afr_fchmod_unwind (frame, this); - - call_count = afr_frame_return (frame); - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; +int +afr_writev_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->writev, + local->fd, local->cont.writev.vector, + local->cont.writev.count, local->cont.writev.offset, + local->cont.writev.flags, local->cont.writev.iobref, + local->xdata_req); + return 0; } int -afr_fchmod_wind (call_frame_t *frame, xlator_t *this) +afr_do_writev (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int i = 0; - int call_count = -1; + call_frame_t *transaction_frame = NULL; + afr_local_t *local = NULL; + int ret = -1; + int op_errno = ENOMEM; - local = frame->local; - priv = this->private; + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; - call_count = afr_up_children_count (priv->child_count, local->child_up); + local = frame->local; + transaction_frame->local = local; + frame->local = NULL; - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } + if (!AFR_FRAME_INIT (frame, op_errno)) + goto out; + + local->op = GF_FOP_WRITE; + + local->transaction.wind = afr_writev_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_transaction_writev_unwind; + + local->transaction.main_frame = frame; + + if (local->fd->flags & O_APPEND) { + /* + * Backend vfs ignores the 'offset' for append mode fd so + * locking just the region provided for the writev does not + * give consistency gurantee. The actual write may happen at a + * completely different range than the one provided by the + * offset, len in the fop. So lock the entire file. + */ + local->transaction.start = 0; + local->transaction.len = 0; + } else { + local->transaction.start = local->cont.writev.offset; + local->transaction.len = iov_length (local->cont.writev.vector, + local->cont.writev.count); + } + + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_fchmod_wind_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fchmod, - local->fd, - local->cont.fchmod.mode); - - if (!--call_count) - break; - } - } - return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } int -afr_fchmod_done (call_frame_t *frame, xlator_t *this) +afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata) { - afr_local_t * local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int op_errno = ENOMEM; - AFR_STACK_DESTROY (frame); - - return 0; -} + priv = this->private; + QUORUM_CHECK(writev,out); -int32_t -afr_fchmod (call_frame_t *frame, xlator_t *this, - fd_t *fd, mode_t mode) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - int ret = -1; + local->cont.writev.vector = iov_dup (vector, count); + if (!local->cont.writev.vector) + goto out; + local->cont.writev.count = count; + local->cont.writev.offset = offset; + local->cont.writev.flags = flags; + local->cont.writev.iobref = iobref_ref (iobref); - int op_ret = -1; - int op_errno = 0; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + if (!local->xdata_req) + goto out; - priv = this->private; + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); + if (dict_set_uint32 (local->xdata_req, GLUSTERFS_OPEN_FD_COUNT, 4)) { + op_errno = ENOMEM; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + if (dict_set_uint32 (local->xdata_req, GLUSTERFS_WRITE_IS_APPEND, 4)) { + op_errno = ENOMEM; goto out; } - transaction_frame->local = local; + /* Set append_write to be true speculatively. If on any + server it turns not be true, we unset it in the + callback. + */ + local->append_write = _gf_true; - local->cont.fchmod.mode = mode; - local->cont.fchmod.ino = fd->inode->ino; + /* detect here, but set it in writev_wind_cbk *after* the unstable + write is performed + */ + local->stable_write = !!((fd->flags|flags)&(O_SYNC|O_DSYNC)); - local->transaction.fop = afr_fchmod_wind; - local->transaction.done = afr_fchmod_done; - local->transaction.unwind = afr_fchmod_unwind; + afr_fix_open (fd, this); - local->fd = fd_ref (fd); - - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; + afr_do_writev (frame, this); - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); - - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); - } + AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + return 0; } + /* }}} */ -/* {{{ chown */ +/* {{{ truncate */ int -afr_chown_unwind (call_frame_t *frame, xlator_t *this) +afr_truncate_unwind (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - call_frame_t *main_frame = NULL; + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; - priv = this->private; + local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - local->cont.chown.buf.st_ino = local->cont.chown.ino; - AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, - &local->cont.chown.buf); - } - return 0; + AFR_STACK_UNWIND (truncate, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; } int -afr_chown_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) +afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int child_index = (long) cookie; - int need_unwind = 0; + afr_local_t *local = NULL; local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.chown.buf = *buf; - } - local->success_count++; - - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - if (need_unwind) { - local->transaction.unwind (frame, this); - } + if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); } int -afr_chown_wind (call_frame_t *frame, xlator_t *this) +afr_truncate_wind (call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_chown_wind_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->chown, - &local->loc, local->cont.chown.uid, - local->cont.chown.gid); - - if (!--call_count) - break; - } - } - - return 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->truncate, + &local->loc, local->cont.truncate.offset, + local->xdata_req); + return 0; } int -afr_chown_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - - -int -afr_chown (call_frame_t *frame, xlator_t *this, - loc_t *loc, uid_t uid, gid_t gid) +afr_truncate (call_frame_t *frame, xlator_t *this, + loc_t *loc, off_t offset, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; int ret = -1; + int op_errno = ENOMEM; - int op_ret = -1; - int op_errno = 0; + priv = this->private; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + QUORUM_CHECK(truncate,out); - priv = this->private; + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) goto out; - } - ALLOC_OR_GOTO (local, afr_local_t, out); + local->cont.truncate.offset = offset; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + if (!local->xdata_req) goto out; - } - transaction_frame->local = local; + local->transaction.wind = afr_truncate_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_truncate_unwind; - local->cont.chown.uid = uid; - local->cont.chown.gid = gid; - local->cont.chown.ino = loc->inode->ino; + loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); - local->transaction.fop = afr_chown_wind; - local->transaction.done = afr_chown_done; - local->transaction.unwind = afr_chown_unwind; + local->op = GF_FOP_TRUNCATE; - loc_copy (&local->loc, loc); + local->transaction.main_frame = frame; + local->transaction.start = offset; + local->transaction.len = 0; - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; + /* Set it true speculatively, will get reset in afr_truncate_wind_cbk + if truncate was not a NOP */ + local->stable_write = _gf_true; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); - return 0; + AFR_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } /* }}} */ -/* {{{ chown */ - -int -afr_fchown_unwind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - local->cont.fchown.buf.st_ino = local->cont.fchown.ino; - AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, - &local->cont.fchown.buf); - } - return 0; -} +/* {{{ ftruncate */ int -afr_fchown_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) +afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int child_index = (long) cookie; - int need_unwind = 0; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.fchown.buf = *buf; - } - local->success_count++; - - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } - } + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + local = frame->local; - if (need_unwind) { - local->transaction.unwind (frame, this); - } - - call_count = afr_frame_return (frame); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; } int -afr_fchown_wind (call_frame_t *frame, xlator_t *this) +afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int i = 0; + afr_local_t *local = NULL; local = frame->local; - priv = this->private; - call_count = afr_up_children_count (priv->child_count, local->child_up); + if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_fchown_wind_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fchown, - local->fd, local->cont.fchown.uid, - local->cont.fchown.gid); - - if (!--call_count) - break; - } - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); } int -afr_fchown_done (call_frame_t *frame, xlator_t *this) +afr_ftruncate_wind (call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; local = frame->local; + priv = this->private; - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; + STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->ftruncate, + local->fd, local->cont.ftruncate.offset, + local->xdata_req); + return 0; } int -afr_fchown (call_frame_t *frame, xlator_t *this, - fd_t *fd, uid_t uid, gid_t gid) +afr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; int ret = -1; + int op_errno = ENOMEM; - int op_ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + priv = this->private; - priv = this->private; + QUORUM_CHECK(ftruncate,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); + if (!frame) goto out; - } - ALLOC_OR_GOTO (local, afr_local_t, out); + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + local->cont.ftruncate.offset = offset; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) goto out; - } - transaction_frame->local = local; + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); - local->cont.fchown.uid = uid; - local->cont.fchown.gid = gid; - local->cont.fchown.ino = fd->inode->ino; + local->op = GF_FOP_FTRUNCATE; - local->transaction.fop = afr_fchown_wind; - local->transaction.done = afr_fchown_done; - local->transaction.unwind = afr_fchown_unwind; + local->transaction.wind = afr_ftruncate_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_ftruncate_unwind; - local->fd = fd_ref (fd); + local->transaction.main_frame = frame; - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; + local->transaction.start = local->cont.ftruncate.offset; + local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + afr_fix_open (fd, this); - op_ret = 0; -out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); - } + /* Set it true speculatively, will get reset in afr_ftruncate_wind_cbk + if truncate was not a NOP */ + local->stable_write = _gf_true; + + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } return 0; +out: + AFR_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; } /* }}} */ -/* {{{ writev */ +/* {{{ setattr */ int -afr_writev_unwind (call_frame_t *frame, xlator_t *this) +afr_setattr_unwind (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; - priv = this->private; + local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - local->cont.writev.buf.st_ino = local->cont.writev.ino; - AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, - &local->cont.writev.buf); - } - return 0; + AFR_STACK_UNWIND (setattr, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + local->xdata_rsp); + return 0; } int -afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) +afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + struct iatt *preop, struct iatt *postop, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + preop, postop, xdata); +} - int child_index = (long) cookie; - int call_count = -1; - int need_unwind = 0; - local = frame->local; - priv = this->private; +int +afr_setattr_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->setattr, + &local->loc, &local->cont.setattr.in_buf, + local->cont.setattr.valid, local->xdata_req); + return 0; +} - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.writev.buf = *buf; - } - local->success_count++; +int +afr_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf, + int32_t valid, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } - } + priv = this->private; - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + QUORUM_CHECK(setattr,out); - call_count = afr_frame_return (frame); + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; - if (call_count == 0) { - local->transaction.unwind (frame, this); + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; - local->transaction.resume (frame, this); - } - - return 0; -} + local->cont.setattr.in_buf = *buf; + local->cont.setattr.valid = valid; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + if (!local->xdata_req) + goto out; -int -afr_writev_wind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int i = 0; - int call_count = -1; + local->transaction.wind = afr_setattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_setattr_unwind; - local = frame->local; - priv = this->private; + loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); - call_count = afr_up_children_count (priv->child_count, local->child_up); + local->op = GF_FOP_SETATTR; - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; + + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->writev, - local->fd, - local->cont.writev.vector, - local->cont.writev.count, - local->cont.writev.offset, - local->cont.writev.iobref); - - if (!--call_count) - break; - } - } - return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } +/* {{{ fsetattr */ int -afr_writev_done (call_frame_t *frame, xlator_t *this) +afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; + local = frame->local; - iobref_unref (local->cont.writev.iobref); - local->cont.writev.iobref = NULL; + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - local->transaction.unwind (frame, this); + AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; +} - AFR_STACK_DESTROY (frame); - return 0; +int +afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, dict_t *xdata) +{ + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + preop, postop, xdata); } int -afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int32_t count, off_t offset, - struct iobref *iobref) +afr_fsetattr_wind (call_frame_t *frame, xlator_t *this, int subvol) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fsetattr, + local->fd, &local->cont.fsetattr.in_buf, + local->cont.fsetattr.valid, local->xdata_req); + return 0; +} - int ret = -1; - int op_ret = -1; - int op_errno = 0; +int +afr_fsetattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + priv = this->private; - priv = this->private; + QUORUM_CHECK(fsetattr,out); - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) goto out; - } - ALLOC_OR_GOTO (local, afr_local_t, out); + local->cont.fsetattr.in_buf = *buf; + local->cont.fsetattr.valid = valid; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + if (!local->xdata_req) goto out; - } - transaction_frame->local = local; + local->transaction.wind = afr_fsetattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_fsetattr_unwind; - local->op = GF_FOP_WRITE; - local->cont.writev.vector = iov_dup (vector, count); - local->cont.writev.count = count; - local->cont.writev.offset = offset; - local->cont.writev.ino = fd->inode->ino; - local->cont.writev.iobref = iobref_ref (iobref); + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); - local->transaction.fop = afr_writev_wind; - local->transaction.done = afr_writev_done; - local->transaction.unwind = afr_writev_unwind; + local->op = GF_FOP_FSETATTR; - local->fd = fd_ref (fd); + afr_fix_open (fd, this); - local->transaction.main_frame = frame; - if (fd->flags & O_APPEND) { - local->transaction.start = 0; - local->transaction.len = 0; - } else { - local->transaction.start = offset; - local->transaction.len = iov_length (vector, count); - } + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); - return 0; + AFR_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } -/* }}} */ +/* {{{ setxattr */ -/* {{{ truncate */ int -afr_truncate_unwind (call_frame_t *frame, xlator_t *this) +afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - call_frame_t *main_frame = NULL; + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; - priv = this->private; + local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - local->cont.truncate.buf.st_ino = local->cont.truncate.ino; - AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, - &local->cont.truncate.buf); - } - return 0; + AFR_STACK_UNWIND (setxattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); + return 0; } int -afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) +afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int child_index = (long) cookie; - int call_count = -1; - int need_unwind = 0; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.truncate.buf = *buf; - } - local->success_count++; - - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } - } - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + NULL, NULL, xdata); } -int32_t -afr_truncate_wind (call_frame_t *frame, xlator_t *this) +int +afr_setxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->truncate, - &local->loc, - local->cont.truncate.offset); - - if (!--call_count) - break; - } - } - - return 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->setxattr, + &local->loc, local->cont.setxattr.dict, + local->cont.setxattr.flags, local->xdata_req); + return 0; } int -afr_truncate_done (call_frame_t *frame, xlator_t *this) +afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) { - afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = EINVAL; - local = frame->local; + GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict, + op_errno, out); - local->transaction.unwind (frame, this); + GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict, + op_errno, out); - AFR_STACK_DESTROY (frame); + priv = this->private; - return 0; -} + QUORUM_CHECK(setxattr,out); + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; -int -afr_truncate (call_frame_t *frame, xlator_t *this, - loc_t *loc, off_t offset) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; - int ret = -1; + local->cont.setxattr.dict = dict_ref (dict); + local->cont.setxattr.flags = flags; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - int op_ret = -1; - int op_errno = 0; + if (!local->xdata_req) + goto out; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + local->transaction.wind = afr_setxattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_setxattr_unwind; - priv = this->private; + loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - ALLOC_OR_GOTO (local, afr_local_t, out); + local->op = GF_FOP_SETXATTR; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { op_errno = -ret; goto out; - } - - transaction_frame->local = local; - - local->op_ret = -1; - - local->cont.truncate.offset = offset; - local->cont.truncate.ino = loc->inode->ino; - - local->transaction.fop = afr_truncate_wind; - local->transaction.done = afr_truncate_done; - local->transaction.unwind = afr_truncate_unwind; - - loc_copy (&local->loc, loc); - - local->transaction.main_frame = frame; - local->transaction.start = 0; - local->transaction.len = offset; - - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - - op_ret = 0; -out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); - } + } return 0; -} +out: + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); -/* }}} */ + return 0; +} -/* {{{ ftruncate */ +/* {{{ fsetxattr */ int -afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this) +afr_fsetxattr_unwind (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; - priv = this->private; + local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - local->cont.ftruncate.buf.st_ino = local->cont.ftruncate.ino; - AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, - &local->cont.ftruncate.buf); - } - return 0; + AFR_STACK_UNWIND (fsetxattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); + return 0; } int -afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) +afr_fsetxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int child_index = (long) cookie; - int call_count = -1; - int need_unwind = 0; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.ftruncate.buf = *buf; - } - local->success_count++; - - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } - } - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + NULL, NULL, xdata); } int -afr_ftruncate_wind (call_frame_t *frame, xlator_t *this) +afr_fsetxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->ftruncate, - local->fd, local->cont.ftruncate.offset); - - if (!--call_count) - break; - } - } - - return 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE (frame, afr_fsetxattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fsetxattr, + local->fd, local->cont.fsetxattr.dict, + local->cont.fsetxattr.flags, local->xdata_req); + return 0; } int -afr_ftruncate_done (call_frame_t *frame, xlator_t *this) +afr_fsetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata) { - afr_local_t *local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - return 0; -} - - -int -afr_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; + GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict, + op_errno, out); - int ret = -1; + GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict, + op_errno, out); - int op_ret = -1; - int op_errno = 0; + priv = this->private; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + QUORUM_CHECK(fsetxattr,out); - priv = this->private; + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) goto out; - } - ALLOC_OR_GOTO (local, afr_local_t, out); + local->cont.fsetxattr.dict = dict_ref (dict); + local->cont.fsetxattr.flags = flags; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - transaction_frame->local = local; - - local->op = GF_FOP_FTRUNCATE; - local->op_ret = -1; + if (!local->xdata_req) + goto out; - local->cont.ftruncate.offset = offset; - local->cont.ftruncate.ino = fd->inode->ino; + local->transaction.wind = afr_fsetxattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_fsetxattr_unwind; - local->transaction.fop = afr_ftruncate_wind; - local->transaction.done = afr_ftruncate_done; - local->transaction.unwind = afr_ftruncate_unwind; + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); - local->fd = fd_ref (fd); + local->op = GF_FOP_FSETXATTR; - local->transaction.main_frame = frame; - local->transaction.start = 0; - local->transaction.len = offset; + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); - return 0; + AFR_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); + return 0; } /* }}} */ -/* {{{ utimens */ + +/* {{{ removexattr */ int -afr_utimens_unwind (call_frame_t *frame, xlator_t *this) +afr_removexattr_unwind (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - call_frame_t *main_frame = NULL; + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; - priv = this->private; + local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - local->cont.utimens.buf.st_ino = local->cont.utimens.ino; - AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, - &local->cont.utimens.buf); - } - return 0; + AFR_STACK_UNWIND (removexattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); + return 0; } int -afr_utimens_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) +afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int child_index = (long) cookie; - int call_count = -1; - int need_unwind = 1; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.utimens.buf = *buf; - } - local->success_count++; - - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + NULL, NULL, xdata); } int -afr_utimens_wind (call_frame_t *frame, xlator_t *this) +afr_removexattr_wind (call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; local = frame->local; priv = this->private; - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_utimens_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->utimens, - &local->loc, - local->cont.utimens.tv); - - if (!--call_count) - break; - } - } - - return 0; + STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->removexattr, + &local->loc, local->cont.removexattr.name, + local->xdata_req); + return 0; } int -afr_utimens_done (call_frame_t *frame, xlator_t *this) +afr_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name, dict_t *xdata) { - afr_local_t * local = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - local = frame->local; + GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*", + name, op_errno, out); - local->transaction.unwind (frame, this); + GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*", + name, op_errno, out); - AFR_STACK_DESTROY (frame); + priv = this->private; - return 0; -} + QUORUM_CHECK(removexattr,out); + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; -int -afr_utimens (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct timespec tv[2]) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; - int ret = -1; + local->cont.removexattr.name = gf_strdup (name); - int op_ret = -1; - int op_errno = 0; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + if (!local->xdata_req) + goto out; - priv = this->private; + local->transaction.wind = afr_removexattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_removexattr_unwind; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } + loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); + + local->op = GF_FOP_REMOVEXATTR; - ALLOC_OR_GOTO (local, afr_local_t, out); + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { op_errno = -ret; goto out; - } + } - transaction_frame->local = local; - - local->op_ret = -1; + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); - local->cont.utimens.tv[0] = tv[0]; - local->cont.utimens.tv[1] = tv[1]; + AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); + return 0; +} - local->cont.utimens.ino = loc->inode->ino; +/* ffremovexattr */ +int +afr_fremovexattr_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; - local->transaction.fop = afr_utimens_wind; - local->transaction.done = afr_utimens_done; - local->transaction.unwind = afr_utimens_unwind; + local = frame->local; - loc_copy (&local->loc, loc); - - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + AFR_STACK_UNWIND (fremovexattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); + return 0; +} - op_ret = 0; -out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); - } - return 0; +int +afr_fremovexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + NULL, NULL, xdata); } -/* }}} */ -/* {{{ setxattr */ +int +afr_fremovexattr_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE (frame, afr_fremovexattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fremovexattr, + local->fd, local->cont.removexattr.name, + local->xdata_req); + return 0; +} int -afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) +afr_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - call_frame_t *main_frame = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - local = frame->local; - priv = this->private; + GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*", + name, op_errno, out); - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*", + name, op_errno, out); - if (main_frame) { - AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno) - } - return 0; -} + priv = this->private; + QUORUM_CHECK(fremovexattr, out); -int -afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; - int call_count = -1; - int need_unwind = 0; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; - local = frame->local; - priv = this->private; + local->cont.removexattr.name = gf_strdup (name); + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - LOCK (&frame->lock); - { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; + if (!local->xdata_req) + goto out; - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } - } + local->transaction.wind = afr_fremovexattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_fremovexattr_unwind; - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); - if (need_unwind) - local->transaction.unwind (frame, this); + local->op = GF_FOP_FREMOVEXATTR; - call_count = afr_frame_return (frame); + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; + + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - if (call_count == 0) { - local->transaction.resume (frame, this); - } - return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (fremovexattr, frame, -1, op_errno, NULL); + + return 0; } int -afr_setxattr_wind (call_frame_t *frame, xlator_t *this) +afr_fallocate_unwind (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); + local = frame->local; - if (call_count == 0) { - local->transaction.resume (frame, this); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) return 0; - } - - local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setxattr, - &local->loc, - local->cont.setxattr.dict, - local->cont.setxattr.flags); - - if (!--call_count) - break; - } - } - - return 0; + AFR_STACK_UNWIND (fallocate, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; } int -afr_setxattr_done (call_frame_t *frame, xlator_t *this) +afr_fallocate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = frame->local; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); +} - local->transaction.unwind (frame, this); - AFR_STACK_DESTROY (frame); - - return 0; +int +afr_fallocate_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE (frame, afr_fallocate_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fallocate, + local->fd, local->cont.fallocate.mode, + local->cont.fallocate.offset, + local->cont.fallocate.len, local->xdata_req); + return 0; } int -afr_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int32_t flags) +afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - - int ret = -1; + afr_private_t *priv = NULL; + call_frame_t *transaction_frame = NULL; + afr_local_t *local = NULL; + int ret = -1; + int op_errno = ENOMEM; - int op_ret = -1; - int op_errno = 0; + priv = this->private; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + QUORUM_CHECK(fallocate,out); - priv = this->private; + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) goto out; - } - ALLOC_OR_GOTO (local, afr_local_t, out); + local->cont.fallocate.mode = mode; + local->cont.fallocate.offset = offset; + local->cont.fallocate.len = len; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); - transaction_frame->local = local; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - local->op_ret = -1; + if (!local->xdata_req) + goto out; - local->cont.setxattr.dict = dict_ref (dict); - local->cont.setxattr.flags = flags; + local->op = GF_FOP_FALLOCATE; - local->transaction.fop = afr_setxattr_wind; - local->transaction.done = afr_setxattr_done; - local->transaction.unwind = afr_setxattr_unwind; + local->transaction.wind = afr_fallocate_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_fallocate_unwind; - loc_copy (&local->loc, loc); + local->transaction.main_frame = frame; - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; + local->transaction.start = local->cont.fallocate.offset; + local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + afr_fix_open (fd, this); - op_ret = 0; -out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (frame, op_ret, op_errno); - } + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } -/* }}} */ -/* {{{ removexattr */ +/* }}} */ +/* {{{ discard */ int -afr_removexattr_unwind (call_frame_t *frame, xlator_t *this) +afr_discard_unwind (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - call_frame_t *main_frame = NULL; + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; - priv = this->private; + local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno) - } - return 0; + AFR_STACK_UNWIND (discard, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; } int -afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +afr_discard_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); +} - int call_count = -1; - int need_unwind = 0; - local = frame->local; - priv = this->private; +int +afr_discard_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE (frame, afr_discard_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->discard, + local->fd, local->cont.discard.offset, + local->cont.discard.len, local->xdata_req); + return 0; +} - LOCK (&frame->lock); - { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } - } +int +afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + priv = this->private; - if (need_unwind) - local->transaction.unwind (frame, this); + QUORUM_CHECK(discard, out); - call_count = afr_frame_return (frame); + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; -} + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; + local->cont.discard.offset = offset; + local->cont.discard.len = len; -int32_t -afr_removexattr_wind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); - int call_count = -1; - int i = 0; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - local = frame->local; - priv = this->private; + if (!local->xdata_req) + goto out; - call_count = afr_up_children_count (priv->child_count, local->child_up); + local->op = GF_FOP_DISCARD; - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } + local->transaction.wind = afr_discard_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_discard_unwind; - local->call_count = call_count; + local->transaction.main_frame = frame; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->removexattr, - &local->loc, - local->cont.removexattr.name); + local->transaction.start = local->cont.discard.offset; + local->transaction.len = 0; + + afr_fix_open (fd, this); + + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - if (!--call_count) - break; - } - } - return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } +/* {{{ zerofill */ + int -afr_removexattr_done (call_frame_t *frame, xlator_t *this) +afr_zerofill_unwind (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = frame->local; + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; - local->transaction.unwind (frame, this); + local = frame->local; - AFR_STACK_DESTROY (frame); - - return 0; + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + + AFR_STACK_UNWIND (discard, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; } int -afr_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name) +afr_zerofill_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); +} - int ret = -1; - int op_ret = -1; - int op_errno = 0; +int +afr_zerofill_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE (frame, afr_zerofill_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->zerofill, + local->fd, local->cont.zerofill.offset, + local->cont.zerofill.len, local->xdata_req); + return 0; +} + +int +afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); + priv = this->private; - priv = this->private; + QUORUM_CHECK(discard, out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); + if (!transaction_frame) goto out; - } - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) goto out; - } - transaction_frame->local = local; + local->cont.zerofill.offset = offset; + local->cont.zerofill.len = len; - local->op_ret = -1; + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); + + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - local->cont.removexattr.name = strdup (name); + if (!local->xdata_req) + goto out; - local->transaction.fop = afr_removexattr_wind; - local->transaction.done = afr_removexattr_done; - local->transaction.unwind = afr_removexattr_unwind; + local->op = GF_FOP_ZEROFILL; - loc_copy (&local->loc, loc); + local->transaction.wind = afr_zerofill_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_zerofill_unwind; - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; + local->transaction.main_frame = frame; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + local->transaction.start = local->cont.discard.offset; + local->transaction.len = len; - op_ret = 0; -out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (frame, op_ret, op_errno); - } + afr_fix_open (fd, this); + + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } + +/* }}} */ diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h index 358d25b52..7b1fc5528 100644 --- a/xlators/cluster/afr/src/afr-inode-write.h +++ b/xlators/cluster/afr/src/afr-inode-write.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __INODE_WRITE_H__ @@ -22,43 +13,70 @@ int32_t afr_chmod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode); + loc_t *loc, mode_t mode, dict_t *xdata); int32_t afr_chown (call_frame_t *frame, xlator_t *this, - loc_t *loc, uid_t uid, gid_t gid); + loc_t *loc, uid_t uid, gid_t gid, dict_t *xdata); int afr_fchown (call_frame_t *frame, xlator_t *this, - fd_t *fd, uid_t uid, gid_t gid); + fd_t *fd, uid_t uid, gid_t gid, dict_t *xdata); int32_t afr_fchmod (call_frame_t *frame, xlator_t *this, - fd_t *fd, mode_t mode); + fd_t *fd, mode_t mode, dict_t *xdata); int32_t -afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, +afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, int32_t count, off_t offset, - struct iobref *iobref); + uint32_t flags, struct iobref *iobref, dict_t *xdata); int32_t afr_truncate (call_frame_t *frame, xlator_t *this, - loc_t *loc, off_t offset); + loc_t *loc, off_t offset, dict_t *xdata); int32_t afr_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset); + fd_t *fd, off_t offset, dict_t *xdata); int32_t afr_utimens (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct timespec tv[2]); + loc_t *loc, struct timespec tv[2], dict_t *xdata); + +int +afr_setattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, struct iatt *buf, int32_t valid, dict_t *xdata); + +int +afr_fsetattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata); int32_t afr_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int32_t flags); + loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata); + +int32_t +afr_fsetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata); int32_t afr_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name); + loc_t *loc, const char *name, dict_t *xdata); +int32_t +afr_fremovexattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata); + +int +afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata); + +int +afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata); + +int +afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata); #endif /* __INODE_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c new file mode 100644 index 000000000..a2a758f35 --- /dev/null +++ b/xlators/cluster/afr/src/afr-lk-common.c @@ -0,0 +1,1667 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "dict.h" +#include "byte-order.h" +#include "common-utils.h" + +#include "afr.h" +#include "afr-transaction.h" + +#include <signal.h> + + +#define LOCKED_NO 0x0 /* no lock held */ +#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path */ +#define LOCKED_LOWER 0x2 /* for lower path */ + +#define AFR_TRACE_INODELK_IN(frame, this, params ...) \ + do { \ + afr_private_t *_priv = this->private; \ + if (!_priv->inodelk_trace) \ + break; \ + afr_trace_inodelk_in (frame, this, params); \ + } while (0); + +#define AFR_TRACE_INODELK_OUT(frame, this, params ...) \ + do { \ + afr_private_t *_priv = this->private; \ + if (!_priv->inodelk_trace) \ + break; \ + afr_trace_inodelk_out (frame, this, params); \ + } while (0); + +#define AFR_TRACE_ENTRYLK_IN(frame, this, params ...) \ + do { \ + afr_private_t *_priv = this->private; \ + if (!_priv->entrylk_trace) \ + break; \ + afr_trace_entrylk_in (frame, this, params); \ + } while (0); + +#define AFR_TRACE_ENTRYLK_OUT(frame, this, params ...) \ + do { \ + afr_private_t *_priv = this->private; \ + if (!_priv->entrylk_trace) \ + break; \ + afr_trace_entrylk_out (frame, this, params); \ + } while (0); + +int +afr_entry_lockee_cmp (const void *l1, const void *l2) +{ + const afr_entry_lockee_t *r1 = l1; + const afr_entry_lockee_t *r2 = l2; + int ret = 0; + uuid_t gfid1 = {0}; + uuid_t gfid2 = {0}; + + loc_gfid ((loc_t*)&r1->loc, gfid1); + loc_gfid ((loc_t*)&r2->loc, gfid2); + ret = uuid_compare (gfid1, gfid2); + /*Entrylks with NULL basename are the 'smallest'*/ + if (ret == 0) { + if (!r1->basename) + return -1; + if (!r2->basename) + return 1; + ret = strcmp (r1->basename, r2->basename); + } + + if (ret <= 0) + return -1; + else + return 1; +} + +int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index); + +static int +afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this); + +static uint64_t afr_lock_number = 1; + +static uint64_t +get_afr_lock_number () +{ + return (++afr_lock_number); +} + +int +afr_set_lock_number (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + + local = frame->local; + int_lock = &local->internal_lock; + + int_lock->lock_number = get_afr_lock_number (); + + return 0; +} + +void +afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner) +{ + gf_log (this->name, GF_LOG_TRACE, + "Setting lk-owner=%llu", + (unsigned long long) (unsigned long)lk_owner); + + set_lk_owner_from_ptr (&frame->root->lk_owner, lk_owner); +} + +static int +is_afr_lock_selfheal (afr_local_t *local) +{ + afr_internal_lock_t *int_lock = NULL; + int ret = -1; + + int_lock = &local->internal_lock; + + switch (int_lock->selfheal_lk_type) { + case AFR_DATA_SELF_HEAL_LK: + case AFR_METADATA_SELF_HEAL_LK: + ret = 1; + break; + case AFR_ENTRY_SELF_HEAL_LK: + ret = 0; + break; + } + + return ret; + +} + +int32_t +internal_lock_count (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int32_t call_count = 0; + int i = 0; + + local = frame->local; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) + ++call_count; + } + + return call_count; +} + +static void +afr_print_inodelk (char *str, int size, int cmd, + struct gf_flock *flock, gf_lkowner_t *owner) +{ + char *cmd_str = NULL; + char *type_str = NULL; + + switch (cmd) { +#if F_GETLK != F_GETLK64 + case F_GETLK64: +#endif + case F_GETLK: + cmd_str = "GETLK"; + break; + +#if F_SETLK != F_SETLK64 + case F_SETLK64: +#endif + case F_SETLK: + cmd_str = "SETLK"; + break; + +#if F_SETLKW != F_SETLKW64 + case F_SETLKW64: +#endif + case F_SETLKW: + cmd_str = "SETLKW"; + break; + + default: + cmd_str = "<null>"; + break; + } + + switch (flock->l_type) { + case F_RDLCK: + type_str = "READ"; + break; + case F_WRLCK: + type_str = "WRITE"; + break; + case F_UNLCK: + type_str = "UNLOCK"; + break; + default: + type_str = "UNKNOWN"; + break; + } + + snprintf (str, size, "lock=INODELK, cmd=%s, type=%s, " + "start=%llu, len=%llu, pid=%llu, lk-owner=%s", + cmd_str, type_str, (unsigned long long) flock->l_start, + (unsigned long long) flock->l_len, + (unsigned long long) flock->l_pid, + lkowner_utoa (owner)); + +} + +static void +afr_print_lockee (char *str, int size, loc_t *loc, fd_t *fd, + int child_index) +{ + snprintf (str, size, "path=%s, fd=%p, child=%d", + loc->path ? loc->path : "<nul>", + fd ? fd : NULL, + child_index); +} + +void +afr_print_entrylk (char *str, int size, const char *basename, + gf_lkowner_t *owner) +{ + snprintf (str, size, "Basename=%s, lk-owner=%s", + basename ? basename : "<nul>", + lkowner_utoa (owner)); +} + +static void +afr_print_verdict (int op_ret, int op_errno, char *str) +{ + if (op_ret < 0) { + if (op_errno == EAGAIN) + strcpy (str, "EAGAIN"); + else + strcpy (str, "FAILED"); + } + else + strcpy (str, "GRANTED"); +} + +static void +afr_set_lock_call_type (afr_lock_call_type_t lock_call_type, + char *lock_call_type_str, + afr_internal_lock_t *int_lock) +{ + switch (lock_call_type) { + case AFR_INODELK_TRANSACTION: + if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) + strcpy (lock_call_type_str, "AFR_INODELK_TRANSACTION"); + else + strcpy (lock_call_type_str, "AFR_INODELK_SELFHEAL"); + break; + case AFR_INODELK_NB_TRANSACTION: + if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) + strcpy (lock_call_type_str, "AFR_INODELK_NB_TRANSACTION"); + else + strcpy (lock_call_type_str, "AFR_INODELK_NB_SELFHEAL"); + break; + case AFR_ENTRYLK_TRANSACTION: + if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) + strcpy (lock_call_type_str, "AFR_ENTRYLK_TRANSACTION"); + else + strcpy (lock_call_type_str, "AFR_ENTRYLK_SELFHEAL"); + break; + case AFR_ENTRYLK_NB_TRANSACTION: + if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) + strcpy (lock_call_type_str, "AFR_ENTRYLK_NB_TRANSACTION"); + else + strcpy (lock_call_type_str, "AFR_ENTRYLK_NB_SELFHEAL"); + break; + default: + strcpy (lock_call_type_str, "UNKNOWN"); + break; + } + +} + +static void +afr_trace_inodelk_out (call_frame_t *frame, xlator_t *this, + afr_lock_call_type_t lock_call_type, + afr_lock_op_type_t lk_op_type, struct gf_flock *flock, + int op_ret, int op_errno, int32_t child_index) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + + char lockee[256]; + char lock_call_type_str[256]; + char verdict[16]; + + local = frame->local; + int_lock = &local->internal_lock; + + afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); + + afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); + + afr_print_verdict (op_ret, op_errno, verdict); + + gf_log (this->name, GF_LOG_INFO, + "[%s %s] [%s] lk-owner=%s Lockee={%s} Number={%llu}", + lock_call_type_str, + lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY", + verdict, lkowner_utoa (&frame->root->lk_owner), lockee, + (unsigned long long) int_lock->lock_number); + +} + +static void +afr_trace_inodelk_in (call_frame_t *frame, xlator_t *this, + afr_lock_call_type_t lock_call_type, + afr_lock_op_type_t lk_op_type, struct gf_flock *flock, + int32_t cmd, int32_t child_index) +{ + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + + char lock[256]; + char lockee[256]; + char lock_call_type_str[256]; + + local = frame->local; + int_lock = &local->internal_lock; + + afr_print_inodelk (lock, 256, cmd, flock, &frame->root->lk_owner); + afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); + + afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); + + gf_log (this->name, GF_LOG_INFO, + "[%s %s] Lock={%s} Lockee={%s} Number={%llu}", + lock_call_type_str, + lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST", + lock, lockee, + (unsigned long long) int_lock->lock_number); + +} + +static void +afr_trace_entrylk_in (call_frame_t *frame, xlator_t *this, + afr_lock_call_type_t lock_call_type, + afr_lock_op_type_t lk_op_type, const char *basename, + int32_t cookie) +{ + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + afr_private_t *priv = NULL; + int child_index = 0; + int lockee_no = 0; + + char lock[256]; + char lockee[256]; + char lock_call_type_str[256]; + + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + + if (!priv->entrylk_trace) { + return; + } + lockee_no = cookie / priv->child_count; + child_index = cookie % priv->child_count; + + afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner); + afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd, + child_index); + + afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); + + gf_log (this->name, GF_LOG_INFO, + "[%s %s] Lock={%s} Lockee={%s} Number={%llu}, Cookie={%d}", + lock_call_type_str, + lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST", + lock, lockee, + (unsigned long long) int_lock->lock_number, + cookie); +} + +static void +afr_trace_entrylk_out (call_frame_t *frame, xlator_t *this, + afr_lock_call_type_t lock_call_type, + afr_lock_op_type_t lk_op_type, const char *basename, + int op_ret, int op_errno, int32_t cookie) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int lockee_no = 0; + int child_index = 0; + + char lock[256]; + char lockee[256]; + char lock_call_type_str[256]; + char verdict[16]; + + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + + if (!priv->entrylk_trace) { + return; + } + lockee_no = cookie / priv->child_count; + child_index = cookie % priv->child_count; + + afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner); + afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd, + child_index); + + afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); + + afr_print_verdict (op_ret, op_errno, verdict); + + gf_log (this->name, GF_LOG_INFO, + "[%s %s] [%s] Lock={%s} Lockee={%s} Number={%llu} Cookie={%d}", + lock_call_type_str, + lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY", + verdict, + lock, lockee, + (unsigned long long) int_lock->lock_number, + cookie); + +} + +static int +transaction_lk_op (afr_local_t *local) +{ + afr_internal_lock_t *int_lock = NULL; + int ret = -1; + + int_lock = &local->internal_lock; + + if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) { + gf_log (THIS->name, GF_LOG_DEBUG, + "lk op is for a transaction"); + ret = 1; + } + else if (int_lock->transaction_lk_type == AFR_SELFHEAL_LK) { + gf_log (THIS->name, GF_LOG_DEBUG, + "lk op is for a self heal"); + + ret = 0; + } + + if (ret == -1) + gf_log (THIS->name, GF_LOG_DEBUG, + "lk op is not set"); + + return ret; + +} + +static int +is_afr_lock_transaction (afr_local_t *local) +{ + int ret = 0; + + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + ret = 1; + break; + + case AFR_ENTRY_RENAME_TRANSACTION: + case AFR_ENTRY_TRANSACTION: + ret = 0; + break; + + } + + return ret; +} + +int +afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local, + loc_t *loc, char *basename, int child_count) +{ + int ret = -1; + + loc_copy (&lockee->loc, loc); + lockee->basename = (basename)? gf_strdup (basename): NULL; + if (basename && !lockee->basename) + goto out; + + lockee->locked_count = 0; + lockee->locked_nodes = GF_CALLOC (child_count, + sizeof (*lockee->locked_nodes), + gf_afr_mt_afr_node_character); + + if (!lockee->locked_nodes) + goto out; + + ret = 0; +out: + return ret; + +} + +void +afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock) +{ + int i = 0; + + for (i = 0; i < int_lock->lockee_count; i++) { + loc_wipe (&int_lock->lockee[i].loc); + if (int_lock->lockee[i].basename) + GF_FREE (int_lock->lockee[i].basename); + if (int_lock->lockee[i].locked_nodes) + GF_FREE (int_lock->lockee[i].locked_nodes); + } + + return; +} + +static int +initialize_entrylk_variables (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + afr_private_t *priv = NULL; + + int i = 0; + + priv = this->private; + local = frame->local; + int_lock = &local->internal_lock; + + int_lock->entrylk_lock_count = 0; + int_lock->lock_op_ret = -1; + int_lock->lock_op_errno = 0; + + for (i = 0; i < AFR_LOCKEE_COUNT_MAX; i++) { + if (!int_lock->lockee[i].locked_nodes) + break; + int_lock->lockee[i].locked_count = 0; + memset (int_lock->lockee[i].locked_nodes, 0, + sizeof (*int_lock->lockee[i].locked_nodes) * + priv->child_count); + } + + return 0; +} + +static int +initialize_inodelk_variables (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + afr_private_t *priv = NULL; + afr_inodelk_t *inodelk = NULL; + + priv = this->private; + local = frame->local; + int_lock = &local->internal_lock; + + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + + inodelk->lock_count = 0; + int_lock->lk_attempted_count = 0; + int_lock->lock_op_ret = -1; + int_lock->lock_op_errno = 0; + + memset (inodelk->locked_nodes, 0, + sizeof (*inodelk->locked_nodes) * priv->child_count); + memset (int_lock->locked_nodes, 0, + sizeof (*int_lock->locked_nodes) * priv->child_count); + + return 0; +} + +int +afr_lockee_locked_nodes_count (afr_internal_lock_t *int_lock) +{ + int call_count = 0; + int i = 0; + + for (i = 0; i < int_lock->lockee_count; i++) + call_count += int_lock->lockee[i].locked_count; + + return call_count; +} + +int +afr_locked_nodes_count (unsigned char *locked_nodes, int child_count) + +{ + int i = 0; + int call_count = 0; + + for (i = 0; i < child_count; i++) { + if (locked_nodes[i] & LOCKED_YES) + call_count++; + } + + return call_count; +} + +/* FIXME: What if UNLOCK fails */ +static int32_t +afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + int call_count = 0; + + local = frame->local; + int_lock = &local->internal_lock; + + LOCK (&frame->lock); + { + call_count = --int_lock->lk_call_count; + } + UNLOCK (&frame->lock); + + if (call_count == 0) { + gf_log (this->name, GF_LOG_TRACE, + "All internal locks unlocked"); + int_lock->lock_cbk (frame, this); + } + + return 0; +} + +static int32_t +afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; + int32_t child_index = (long)cookie; + afr_private_t *priv = NULL; + + local = frame->local; + int_lock = &local->internal_lock; + + AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_TRANSACTION, + AFR_UNLOCK_OP, NULL, op_ret, + op_errno, child_index); + + priv = this->private; + + if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { + gf_log (this->name, GF_LOG_INFO, "%s: unlock failed on subvolume %s " + "with lock owner %s", local->loc.path, + priv->children[child_index]->name, + lkowner_utoa (&frame->root->lk_owner)); + } + + + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + inodelk->locked_nodes[child_index] &= LOCKED_NO; + if (local->transaction.eager_lock) + local->transaction.eager_lock[child_index] = 0; + + afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno, xdata); + + return 0; + +} + +static int +afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct gf_flock flock = {0,}; + struct gf_flock full_flock = {0,}; + struct gf_flock *flock_use = NULL; + int call_count = 0; + int i = 0; + int piggyback = 0; + afr_fd_ctx_t *fd_ctx = NULL; + + + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; + flock.l_type = F_UNLCK; + + full_flock.l_type = F_UNLCK; + call_count = afr_locked_nodes_count (inodelk->locked_nodes, + priv->child_count); + + int_lock->lk_call_count = call_count; + + if (!call_count) { + gf_log (this->name, GF_LOG_TRACE, + "No internal locks unlocked"); + int_lock->lock_cbk (frame, this); + goto out; + } + + if (local->fd) + fd_ctx = afr_fd_ctx_get (local->fd, this); + + for (i = 0; i < priv->child_count; i++) { + if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES) + continue; + + if (local->fd) { + flock_use = &flock; + if (!local->transaction.eager_lock[i]) { + goto wind; + } + + piggyback = 0; + + LOCK (&local->fd->lock); + { + if (fd_ctx->lock_piggyback[i]) { + fd_ctx->lock_piggyback[i]--; + piggyback = 1; + } else { + fd_ctx->lock_acquired[i]--; + } + } + UNLOCK (&local->fd->lock); + + if (piggyback) { + afr_unlock_inodelk_cbk (frame, (void *) (long) i, + this, 1, 0, NULL); + if (!--call_count) + break; + continue; + } + + flock_use = &full_flock; + wind: + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_TRANSACTION, + AFR_UNLOCK_OP, flock_use, F_SETLK, + i); + + STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, + (void *) (long)i, + priv->children[i], + priv->children[i]->fops->finodelk, + int_lock->domain, local->fd, + F_SETLK, flock_use, NULL); + + if (!--call_count) + break; + + } else { + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_TRANSACTION, + AFR_UNLOCK_OP, &flock, F_SETLK, i); + + STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, + (void *) (long)i, + priv->children[i], + priv->children[i]->fops->inodelk, + int_lock->domain, &local->loc, + F_SETLK, &flock, NULL); + + if (!--call_count) + break; + } + } +out: + return 0; +} + +static int32_t +afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; + int32_t child_index = 0; + int lockee_no = 0; + + priv = this->private; + lockee_no = (int)((long) cookie) / priv->child_count; + child_index = (int) ((long) cookie) % priv->child_count; + + local = frame->local; + int_lock = &local->internal_lock; + + AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, + AFR_UNLOCK_OP, + int_lock->lockee[lockee_no].basename, op_ret, + op_errno, (int) ((long)cookie)); + + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "%s: unlock failed on %d, reason: %s", + local->loc.path, child_index, strerror (op_errno)); + } + + int_lock->lockee[lockee_no].locked_nodes[child_index] &= LOCKED_NO; + afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno, NULL); + + return 0; +} + +static int +afr_unlock_entrylk (call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int index = 0; + int lockee_no = 0; + int copies = 0; + int i = -1; + + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + copies = priv->child_count; + + call_count = afr_lockee_locked_nodes_count (int_lock); + + int_lock->lk_call_count = call_count; + + if (!call_count){ + gf_log (this->name, GF_LOG_TRACE, + "No internal locks unlocked"); + int_lock->lock_cbk (frame, this); + goto out; + } + + for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) { + lockee_no = i / copies; + index = i % copies; + if (int_lock->lockee[lockee_no].locked_nodes[index] & LOCKED_YES) { + AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, + AFR_UNLOCK_OP, + int_lock->lockee[lockee_no].basename, + i); + + STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk, + (void *) (long) i, + priv->children[index], + priv->children[index]->fops->entrylk, + int_lock->domain, + &int_lock->lockee[lockee_no].loc, + int_lock->lockee[lockee_no].basename, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL); + + if (!--call_count) + break; + } + } + +out: + return 0; + +} + +static int32_t +afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int cky = (long) cookie; + int child_index = 0; + int lockee_no = 0; + + priv = this->private; + local = frame->local; + int_lock = &local->internal_lock; + + child_index = ((int)cky) % priv->child_count; + lockee_no = ((int)cky) / priv->child_count; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + if (op_errno == ENOSYS) { + /* return ENOTSUP */ + gf_log (this->name, GF_LOG_ERROR, + "subvolume does not support locking. " + "please load features/locks xlator on server"); + local->op_ret = op_ret; + int_lock->lock_op_ret = op_ret; + } + + local->op_errno = op_errno; + int_lock->lock_op_errno = op_errno; + } + + int_lock->lk_attempted_count++; + } + UNLOCK (&frame->lock); + + if ((op_ret == -1) && + (op_errno == ENOSYS)) { + afr_unlock (frame, this); + } else { + if (op_ret == 0) { + if (local->transaction.type == AFR_ENTRY_TRANSACTION || + local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + int_lock->lockee[lockee_no].locked_nodes[child_index] |= LOCKED_YES; + int_lock->lockee[lockee_no].locked_count++; + int_lock->entrylk_lock_count++; + } else { + int_lock->locked_nodes[child_index] |= LOCKED_YES; + int_lock->lock_count++; + } + } + afr_lock_blocking (frame, this, cky + 1); + } + + return 0; +} + +static int32_t +afr_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_TRANSACTION, + AFR_LOCK_OP, NULL, op_ret, + op_errno, (long) cookie); + + afr_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata); + return 0; + +} + +static int32_t +afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, + AFR_LOCK_OP, NULL, op_ret, + op_errno, (long)cookie); + + afr_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata); + return 0; +} + +static int +afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + local = frame->local; + int_lock = &local->internal_lock; + + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + memcpy (inodelk->locked_nodes, int_lock->locked_nodes, + sizeof (*inodelk->locked_nodes) * priv->child_count); + inodelk->lock_count = int_lock->lock_count; + break; + + case AFR_ENTRY_RENAME_TRANSACTION: + case AFR_ENTRY_TRANSACTION: + /*entrylk_count is being used in both non-blocking and blocking + * modes */ + break; + } + + return 0; + +} + +static inline gf_boolean_t +afr_is_entrylk (afr_internal_lock_t *int_lock, + afr_transaction_type trans_type) +{ + gf_boolean_t is_entrylk = _gf_false; + + if ((int_lock->transaction_lk_type == AFR_SELFHEAL_LK) && + int_lock->selfheal_lk_type == AFR_ENTRY_SELF_HEAL_LK) { + + is_entrylk = _gf_true; + + } else if ((int_lock->transaction_lk_type == AFR_TRANSACTION_LK) && + (trans_type == AFR_ENTRY_TRANSACTION || + trans_type == AFR_ENTRY_RENAME_TRANSACTION)) { + + is_entrylk = _gf_true; + + } else { + is_entrylk = _gf_false; + } + + return is_entrylk; +} + +static gf_boolean_t +_is_lock_wind_needed (afr_local_t *local, int child_index) +{ + if (!local->child_up[child_index]) + return _gf_false; + + return _gf_true; +} + +int +afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) +{ + afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct gf_flock flock = {0,}; + uint64_t ctx = 0; + int ret = 0; + int child_index = 0; + int lockee_no = 0; + gf_boolean_t is_entrylk = _gf_false; + + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + child_index = cookie % priv->child_count; + lockee_no = cookie / priv->child_count; + is_entrylk = afr_is_entrylk (int_lock, local->transaction.type); + + + if (!is_entrylk) { + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; + flock.l_type = inodelk->flock.l_type; + } + + if (local->fd) { + ret = fd_ctx_get (local->fd, this, &ctx); + + if (ret < 0) { + gf_log (this->name, GF_LOG_INFO, + "unable to get fd ctx for fd=%p", + local->fd); + + local->op_ret = -1; + int_lock->lock_op_ret = -1; + + afr_copy_locked_nodes (frame, this); + + afr_unlock (frame, this); + + return 0; + } + } + + if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { + if ((is_entrylk && int_lock->entrylk_lock_count == 0) || + (!is_entrylk && int_lock->lock_count == 0)) { + gf_log (this->name, GF_LOG_INFO, + "unable to lock on even one child"); + + local->op_ret = -1; + int_lock->lock_op_ret = -1; + + afr_copy_locked_nodes (frame, this); + + afr_unlock(frame, this); + + return 0; + } + } + + if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { + /* we're done locking */ + + gf_log (this->name, GF_LOG_DEBUG, + "we're done locking"); + + afr_copy_locked_nodes (frame, this); + + int_lock->lock_op_ret = 0; + int_lock->lock_cbk (frame, this); + return 0; + } + + if (!_is_lock_wind_needed (local, child_index)) { + afr_lock_blocking (frame, this, cookie + 1); + return 0; + } + + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + + if (local->fd) { + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_TRANSACTION, + AFR_LOCK_OP, &flock, F_SETLKW, + child_index); + + STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->finodelk, + int_lock->domain, local->fd, + F_SETLKW, &flock, NULL); + + } else { + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_TRANSACTION, + AFR_LOCK_OP, &flock, F_SETLKW, + child_index); + + STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->inodelk, + int_lock->domain, &local->loc, + F_SETLKW, &flock, NULL); + } + + break; + + case AFR_ENTRY_RENAME_TRANSACTION: + case AFR_ENTRY_TRANSACTION: + /*Accounting for child_index increments on 'down' + *and 'fd-less' children */ + + if (local->fd) { + AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_TRANSACTION, + AFR_LOCK_OP, + int_lock->lockee[lockee_no].basename, + cookie); + + STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, + (void *) (long) cookie, + priv->children[child_index], + priv->children[child_index]->fops->fentrylk, + int_lock->domain, local->fd, + int_lock->lockee[lockee_no].basename, + ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); + } else { + AFR_TRACE_ENTRYLK_IN (frame, this, + AFR_ENTRYLK_TRANSACTION, + AFR_LOCK_OP, local->transaction.basename, + child_index); + + STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, + (void *) (long) cookie, + priv->children[child_index], + priv->children[child_index]->fops->entrylk, + int_lock->domain, + &int_lock->lockee[lockee_no].loc, + int_lock->lockee[lockee_no].basename, + ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); + } + + break; + } + + return 0; +} + +int32_t +afr_blocking_lock (call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int up_count = 0; + + priv = this->private; + local = frame->local; + int_lock = &local->internal_lock; + + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + initialize_inodelk_variables (frame, this); + break; + + case AFR_ENTRY_RENAME_TRANSACTION: + case AFR_ENTRY_TRANSACTION: + up_count = AFR_COUNT (local->child_up, priv->child_count); + int_lock->lk_call_count = int_lock->lk_expected_count + = (int_lock->lockee_count * + up_count); + initialize_entrylk_variables (frame, this); + break; + } + + afr_lock_blocking (frame, this, 0); + + return 0; +} + +static int32_t +afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + int call_count = 0; + int child_index = (long) cookie; + int copies = 0; + int index = 0; + int lockee_no = 0; + afr_private_t *priv = NULL; + + priv = this->private; + + copies = priv->child_count; + index = child_index % copies; + lockee_no = child_index / copies; + + local = frame->local; + int_lock = &local->internal_lock; + + AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, + AFR_LOCK_OP, + int_lock->lockee[lockee_no].basename, op_ret, + op_errno, (long) cookie); + + LOCK (&frame->lock); + { + if (op_ret < 0 ) { + if (op_errno == ENOSYS) { + /* return ENOTSUP */ + gf_log (this->name, GF_LOG_ERROR, + "subvolume does not support locking. " + "please load features/locks xlator on server"); + local->op_ret = op_ret; + int_lock->lock_op_ret = op_ret; + + int_lock->lock_op_errno = op_errno; + local->op_errno = op_errno; + } + } else if (op_ret == 0) { + int_lock->lockee[lockee_no].locked_nodes[index] |= \ + LOCKED_YES; + int_lock->lockee[lockee_no].locked_count++; + int_lock->entrylk_lock_count++; + } + + call_count = --int_lock->lk_call_count; + } + UNLOCK (&frame->lock); + + if (call_count == 0) { + gf_log (this->name, GF_LOG_TRACE, + "Last locking reply received"); + /* all locks successful. Proceed to call FOP */ + if (int_lock->entrylk_lock_count == + int_lock->lk_expected_count) { + gf_log (this->name, GF_LOG_TRACE, + "All servers locked. Calling the cbk"); + int_lock->lock_op_ret = 0; + int_lock->lock_cbk (frame, this); + } + /* Not all locks were successful. Unlock and try locking + again, this time with serially blocking locks */ + else { + gf_log (this->name, GF_LOG_TRACE, + "%d servers locked. Trying again with blocking calls", + int_lock->lock_count); + + afr_unlock(frame, this); + } + } + + return 0; +} + +int +afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int copies = 0; + int index = 0; + int lockee_no = 0; + int32_t call_count = 0; + int i = 0; + + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + + copies = priv->child_count; + initialize_entrylk_variables (frame, this); + + if (local->fd) { + fd_ctx = afr_fd_ctx_get (local->fd, this); + if (!fd_ctx) { + gf_log (this->name, GF_LOG_INFO, + "unable to get fd ctx for fd=%p", + local->fd); + + local->op_ret = -1; + int_lock->lock_op_ret = -1; + local->op_errno = EINVAL; + int_lock->lock_op_errno = EINVAL; + + afr_unlock (frame, this); + return -1; + } + + call_count = int_lock->lockee_count * internal_lock_count (frame, this); + int_lock->lk_call_count = call_count; + int_lock->lk_expected_count = call_count; + + if (!call_count) { + gf_log (this->name, GF_LOG_INFO, + "fd not open on any subvolumes. aborting."); + afr_unlock (frame, this); + goto out; + } + + /* Send non-blocking entrylk calls only on up children + and where the fd has been opened */ + for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) { + index = i%copies; + lockee_no = i/copies; + if (local->child_up[index]) { + AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, + AFR_LOCK_OP, + int_lock->lockee[lockee_no].basename, + i); + + STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, + (void *) (long) i, + priv->children[index], + priv->children[index]->fops->fentrylk, + this->name, local->fd, + int_lock->lockee[lockee_no].basename, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, + NULL); + if (!--call_count) + break; + } + } + } else { + call_count = int_lock->lockee_count * internal_lock_count (frame, this); + int_lock->lk_call_count = call_count; + int_lock->lk_expected_count = call_count; + + for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) { + index = i%copies; + lockee_no = i/copies; + if (local->child_up[index]) { + AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, + AFR_LOCK_OP, + int_lock->lockee[lockee_no].basename, + i); + + STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, + (void *) (long) i, + priv->children[index], + priv->children[index]->fops->entrylk, + this->name, &int_lock->lockee[lockee_no].loc, + int_lock->lockee[lockee_no].basename, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, + NULL); + + if (!--call_count) + break; + } + } + } +out: + return 0; +} + +int32_t +afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; + afr_local_t *local = NULL; + int call_count = 0; + int child_index = (long) cookie; + afr_fd_ctx_t *fd_ctx = NULL; + + + local = frame->local; + int_lock = &local->internal_lock; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + + AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_NB_TRANSACTION, + AFR_LOCK_OP, NULL, op_ret, + op_errno, (long) cookie); + + if (local->fd) + fd_ctx = afr_fd_ctx_get (local->fd, this); + + LOCK (&frame->lock); + { + if (op_ret < 0) { + if (op_errno == ENOSYS) { + /* return ENOTSUP */ + gf_log (this->name, GF_LOG_ERROR, + "subvolume does not support locking. " + "please load features/locks xlator on " + "server"); + local->op_ret = op_ret; + int_lock->lock_op_ret = op_ret; + int_lock->lock_op_errno = op_errno; + local->op_errno = op_errno; + } + if (local->transaction.eager_lock) + local->transaction.eager_lock[child_index] = 0; + } else { + inodelk->locked_nodes[child_index] |= LOCKED_YES; + inodelk->lock_count++; + + if (local->transaction.eager_lock && + local->transaction.eager_lock[child_index] && + local->fd) { + /* piggybacked */ + if (op_ret == 1) { + /* piggybacked */ + } else if (op_ret == 0) { + /* lock acquired from server */ + fd_ctx->lock_acquired[child_index]++; + } + } + } + + call_count = --int_lock->lk_call_count; + } + UNLOCK (&frame->lock); + + if (call_count == 0) { + gf_log (this->name, GF_LOG_TRACE, + "Last inode locking reply received"); + /* all locks successful. Proceed to call FOP */ + if (inodelk->lock_count == int_lock->lk_expected_count) { + gf_log (this->name, GF_LOG_TRACE, + "All servers locked. Calling the cbk"); + int_lock->lock_op_ret = 0; + int_lock->lock_cbk (frame, this); + } + /* Not all locks were successful. Unlock and try locking + again, this time with serially blocking locks */ + else { + gf_log (this->name, GF_LOG_TRACE, + "%d servers locked. Trying again with blocking calls", + int_lock->lock_count); + + afr_unlock(frame, this); + } + } + + return 0; +} + +int +afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int32_t call_count = 0; + int i = 0; + int ret = 0; + struct gf_flock flock = {0,}; + struct gf_flock full_flock = {0,}; + struct gf_flock *flock_use = NULL; + int piggyback = 0; + + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; + flock.l_type = inodelk->flock.l_type; + + full_flock.l_type = inodelk->flock.l_type; + + initialize_inodelk_variables (frame, this); + + if (local->fd) { + fd_ctx = afr_fd_ctx_get (local->fd, this); + if (!fd_ctx) { + gf_log (this->name, GF_LOG_INFO, + "unable to get fd ctx for fd=%p", + local->fd); + + local->op_ret = -1; + int_lock->lock_op_ret = -1; + local->op_errno = EINVAL; + int_lock->lock_op_errno = EINVAL; + + afr_unlock (frame, this); + ret = -1; + goto out; + } + + call_count = internal_lock_count (frame, this); + int_lock->lk_call_count = call_count; + int_lock->lk_expected_count = call_count; + + if (!call_count) { + gf_log (this->name, GF_LOG_INFO, + "fd not open on any subvolumes. aborting."); + afr_unlock (frame, this); + goto out; + } + + /* Send non-blocking inodelk calls only on up children + and where the fd has been opened */ + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; + + flock_use = &flock; + if (!local->transaction.eager_lock_on) { + goto wind; + } + + piggyback = 0; + local->transaction.eager_lock[i] = 1; + + afr_set_delayed_post_op (frame, this); + + LOCK (&local->fd->lock); + { + if (fd_ctx->lock_acquired[i]) { + fd_ctx->lock_piggyback[i]++; + piggyback = 1; + } + } + UNLOCK (&local->fd->lock); + + if (piggyback) { + /* (op_ret == 1) => indicate piggybacked lock */ + afr_nonblocking_inodelk_cbk (frame, (void *) (long) i, + this, 1, 0, NULL); + if (!--call_count) + break; + continue; + } + flock_use = &full_flock; + wind: + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_NB_TRANSACTION, + AFR_LOCK_OP, flock_use, F_SETLK, i); + + STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->finodelk, + int_lock->domain, local->fd, + F_SETLK, flock_use, NULL); + + if (!--call_count) + break; + } + } else { + call_count = internal_lock_count (frame, this); + int_lock->lk_call_count = call_count; + int_lock->lk_expected_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_NB_TRANSACTION, + AFR_LOCK_OP, &flock, F_SETLK, i); + + STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->inodelk, + int_lock->domain, &local->loc, + F_SETLK, &flock, NULL); + + if (!--call_count) + break; + } + } +out: + return ret; +} + +int32_t +afr_unlock (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + if (transaction_lk_op (local)) { + if (is_afr_lock_transaction (local)) + afr_unlock_inodelk (frame, this); + else + afr_unlock_entrylk (frame, this); + + } else { + if (is_afr_lock_selfheal (local)) + afr_unlock_inodelk (frame, this); + else + afr_unlock_entrylk (frame, this); + } + + return 0; +} + +int +afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, + unsigned int child_count) +{ + afr_local_t *dst_local = NULL; + afr_local_t *src_local = NULL; + afr_internal_lock_t *dst_lock = NULL; + afr_internal_lock_t *src_lock = NULL; + afr_inodelk_t *dst_inodelk = NULL; + afr_inodelk_t *src_inodelk = NULL; + int ret = -1; + + src_local = src->local; + src_lock = &src_local->internal_lock; + src_inodelk = afr_get_inodelk (src_lock, dom); + dst_local = dst->local; + dst_lock = &dst_local->internal_lock; + dst_inodelk = afr_get_inodelk (dst_lock, dom); + if (!dst_inodelk || !src_inodelk) + goto out; + if (src_inodelk->locked_nodes) { + memcpy (dst_inodelk->locked_nodes, src_inodelk->locked_nodes, + sizeof (*dst_inodelk->locked_nodes) * child_count); + memset (src_inodelk->locked_nodes, 0, + sizeof (*src_inodelk->locked_nodes) * child_count); + } + + dst_lock->transaction_lk_type = src_lock->transaction_lk_type; + dst_lock->selfheal_lk_type = src_lock->selfheal_lk_type; + dst_inodelk->lock_count = src_inodelk->lock_count; + src_inodelk->lock_count = 0; + ret = 0; +out: + return ret; +} diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h new file mode 100644 index 000000000..05df90cc0 --- /dev/null +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -0,0 +1,49 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef __AFR_MEM_TYPES_H__ +#define __AFR_MEM_TYPES_H__ + +#include "mem-types.h" + +enum gf_afr_mem_types_ { + gf_afr_mt_iovec = gf_common_mt_end + 1, + gf_afr_mt_afr_fd_ctx_t, + gf_afr_mt_afr_private_t, + gf_afr_mt_int32_t, + gf_afr_mt_char, + gf_afr_mt_xattr_key, + gf_afr_mt_dict_t, + gf_afr_mt_xlator_t, + gf_afr_mt_iatt, + gf_afr_mt_int, + gf_afr_mt_afr_node_character, + gf_afr_mt_sh_diff_loop_state, + gf_afr_mt_uint8_t, + gf_afr_mt_loc_t, + gf_afr_mt_entry_name, + gf_afr_mt_pump_priv, + gf_afr_mt_locked_fd, + gf_afr_mt_inode_ctx_t, + gf_afr_fd_paused_call_t, + gf_afr_mt_crawl_data_t, + gf_afr_mt_brick_pos_t, + gf_afr_mt_shd_bool_t, + gf_afr_mt_shd_timer_t, + gf_afr_mt_shd_event_t, + gf_afr_mt_time_t, + gf_afr_mt_pos_data_t, + gf_afr_mt_reply_t, + gf_afr_mt_subvol_healer_t, + gf_afr_mt_end +}; +#endif + diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c new file mode 100644 index 000000000..f86aa7fd8 --- /dev/null +++ b/xlators/cluster/afr/src/afr-open.c @@ -0,0 +1,337 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" +#include "statedump.h" + +#include "fd.h" + +#include "afr-inode-read.h" +#include "afr-inode-write.h" +#include "afr-dir-read.h" +#include "afr-dir-write.h" +#include "afr-transaction.h" + + +gf_boolean_t +afr_is_fd_fixable (fd_t *fd) +{ + if (!fd || !fd->inode) + return _gf_false; + else if (fd_is_anonymous (fd)) + return _gf_false; + else if (uuid_is_null (fd->inode->gfid)) + return _gf_false; + + return _gf_true; +} + + +int +afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t * local = frame->local; + + AFR_STACK_UNWIND (open, frame, local->op_ret, local->op_errno, + local->fd, xdata); + return 0; +} + + +int +afr_open_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + fd_t *fd, dict_t *xdata) +{ + afr_local_t * local = NULL; + int call_count = -1; + int child_index = (long) cookie; + afr_fd_ctx_t *fd_ctx = NULL; + + local = frame->local; + fd_ctx = local->fd_ctx; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } else { + local->op_ret = op_ret; + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if ((fd_ctx->flags & O_TRUNC) && (local->op_ret >= 0)) { + STACK_WIND (frame, afr_open_ftruncate_cbk, + this, this->fops->ftruncate, + fd, 0, NULL); + } else { + AFR_STACK_UNWIND (open, frame, local->op_ret, + local->op_errno, local->fd, + local->xdata_rsp); + } + } + + return 0; +} + +int +afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + int i = 0; + int32_t call_count = 0; + int32_t op_errno = 0; + afr_fd_ctx_t *fd_ctx = NULL; + + //We can't let truncation to happen outside transaction. + + priv = this->private; + + if (flags & (O_CREAT|O_TRUNC)) { + QUORUM_CHECK(open,out); + } + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) { + op_errno = ENOMEM; + goto out; + } + + local->fd = fd_ref (fd); + local->fd_ctx = fd_ctx; + fd_ctx->flags = flags; + + call_count = local->call_count; + + local->cont.open.flags = flags; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->open, + loc, (flags & ~O_TRUNC), fd, xdata); + if (!--call_count) + break; + } + } + + return 0; +out: + AFR_STACK_UNWIND (open, frame, -1, op_errno, fd, NULL); + + return 0; +} + +int +afr_openfd_fix_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, + dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int call_count = 0; + int child_index = (long) cookie; + + priv = this->private; + local = frame->local; + + if (op_ret >= 0) { + gf_log (this->name, GF_LOG_DEBUG, "fd for %s opened " + "successfully on subvolume %s", local->loc.path, + priv->children[child_index]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, "Failed to open %s " + "on subvolume %s", local->loc.path, + priv->children[child_index]->name); + } + + fd_ctx = local->fd_ctx; + + LOCK (&local->fd->lock); + { + if (op_ret >= 0) { + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + } else { + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } + } + UNLOCK (&local->fd->lock); + + call_count = afr_frame_return (frame); + if (call_count == 0) + AFR_STACK_DESTROY (frame); + + return 0; +} + + +static int +afr_fd_ctx_need_open (fd_t *fd, xlator_t *this, unsigned char *need_open) +{ + afr_fd_ctx_t *fd_ctx = NULL; + afr_private_t *priv = NULL; + int i = 0; + int count = 0; + + priv = this->private; + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + return 0; + + LOCK (&fd->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (fd_ctx->opened_on[i] == AFR_FD_NOT_OPENED && + priv->child_up[i]) { + fd_ctx->opened_on[i] = AFR_FD_OPENING; + need_open[i] = 1; + count++; + } else { + need_open[i] = 0; + } + } + } + UNLOCK (&fd->lock); + + return count; +} + + +void +afr_fix_open (fd_t *fd, xlator_t *this) +{ + afr_private_t *priv = NULL; + int i = 0; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + int ret = -1; + int32_t op_errno = 0; + afr_fd_ctx_t *fd_ctx = NULL; + unsigned char *need_open = NULL; + int call_count = 0; + + priv = this->private; + + if (!afr_is_fd_fixable (fd)) + goto out; + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + goto out; + + need_open = alloca0 (priv->child_count); + + call_count = afr_fd_ctx_need_open (fd, this, need_open); + if (!call_count) + goto out; + + frame = create_frame (this, this->ctx->pool); + if (!frame) + goto out; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + local->loc.inode = inode_ref (fd->inode); + ret = loc_path (&local->loc, NULL); + if (ret < 0) + goto out; + + local->fd = fd_ref (fd); + local->fd_ctx = fd_ctx; + + local->call_count = call_count; + + gf_log (this->name, GF_LOG_DEBUG, "need open count: %d", + call_count); + + for (i = 0; i < priv->child_count; i++) { + if (!need_open[i]) + continue; + + if (IA_IFDIR == fd->inode->ia_type) { + gf_log (this->name, GF_LOG_DEBUG, + "opening fd for dir %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk, + (void*) (long) i, + priv->children[i], + priv->children[i]->fops->opendir, + &local->loc, local->fd, + NULL); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "opening fd for file %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk, + (void *)(long) i, + priv->children[i], + priv->children[i]->fops->open, + &local->loc, + fd_ctx->flags & (~O_TRUNC), + local->fd, NULL); + } + + if (!--call_count) + break; + } + + return; +out: + if (frame) + AFR_STACK_DESTROY (frame); +} diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c new file mode 100644 index 000000000..186f68c33 --- /dev/null +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -0,0 +1,239 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "afr.h" +#include "afr-transaction.h" + +int +afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int subvol = -1; + + local = frame->local; + priv = this->private; + + + for (i = 0; i < priv->child_count; i++) { + if (!local->readable[i]) { + /* don't even bother trying here. + just mark as attempted and move on. */ + local->read_attempted[i] = 1; + continue; + } + + if (!local->read_attempted[i]) { + subvol = i; + break; + } + } + + /* If no more subvols were available for reading, we leave + @subvol as -1, which is an indication we have run out of + readable subvols. */ + if (subvol != -1) + local->read_attempted[subvol] = 1; + local->readfn (frame, this, subvol); + + return 0; +} + + +int +afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err) +{ + afr_local_t *local = NULL; + int read_subvol = 0; + int event_generation = 0; + inode_t *inode = NULL; + int ret = -1; + + local = frame->local; + inode = local->inode; + + if (err) { + local->op_errno = -err; + local->op_ret = -1; + read_subvol = -1; + goto readfn; + } + + ret = afr_inode_read_subvol_type_get (inode, this, local->readable, + &event_generation, + local->transaction.type); + + if (ret == -1 || !event_generation) { + /* Even after refresh, we don't have a good + read subvolume. Time to bail */ + local->op_ret = -1; + local->op_errno = EIO; + read_subvol = -1; + goto readfn; + } + + read_subvol = afr_read_subvol_select_by_policy (inode, this, + local->readable); + + if (read_subvol == -1) { + local->op_ret = -1; + local->op_errno = EIO; + goto readfn; + } + + if (local->read_attempted[read_subvol]) { + afr_read_txn_next_subvol (frame, this); + return 0; + } + + local->read_attempted[read_subvol] = 1; +readfn: + local->readfn (frame, this, read_subvol); + + return 0; +} + + +int +afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + + local = frame->local; + + if (!local->refreshed) { + local->refreshed = _gf_true; + afr_inode_refresh (frame, this, local->inode, + afr_read_txn_refresh_done); + } else { + afr_read_txn_next_subvol (frame, this); + } + + return 0; +} + + +/* afr_read_txn_wipe: + + clean internal variables in @local in order to make + it possible to call afr_read_txn() multiple times from + the same frame +*/ + +void +afr_read_txn_wipe (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + + local->readfn = NULL; + + if (local->inode) + inode_unref (local->inode); + + for (i = 0; i < priv->child_count; i++) { + local->read_attempted[i] = 0; + local->readable[i] = 0; + } +} + + +/* + afr_read_txn: + + This is the read transaction function. The way it works: + + - Determine read-subvolume from inode ctx. + + - If read-subvolume's generation was stale, refresh ctx once by + calling afr_inode_refresh() + + Else make an attempt to read on read-subvolume. + + - If attempted read on read-subvolume fails, refresh ctx once + by calling afr_inode_refresh() + + - After ctx refresh, query read-subvolume freshly and attempt + read once. + + - If read fails, try every other readable[] subvolume before + finally giving up. readable[] elements are set by afr_inode_refresh() + based on dirty and pending flags. + + - If file is in split brain in the backend, generation will be + kept 0 by afr_inode_refresh() and readable[] will be set 0 for + all elements. Therefore reads always fail. +*/ + +int +afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_read_txn_wind_t readfn, afr_transaction_type type) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int read_subvol = -1; + int event_generation = 0; + int ret = -1; + + priv = this->private; + local = frame->local; + + afr_read_txn_wipe (frame, this); + + local->readfn = readfn; + local->inode = inode_ref (inode); + + local->transaction.type = type; + ret = afr_inode_read_subvol_type_get (inode, this, local->readable, + &event_generation, type); + if (ret == -1) + /* very first transaction on this inode */ + goto refresh; + + if (local->event_generation != event_generation) + /* servers have disconnected / reconnected, and possibly + rebooted, very likely changing the state of freshness + of copies */ + goto refresh; + + read_subvol = afr_read_subvol_select_by_policy (inode, this, + local->readable); + + if (read_subvol < 0 || read_subvol > priv->child_count) { + gf_log (this->name, GF_LOG_WARNING, "Unreadable subvolume %d " + "found with event generation %d", read_subvol, + event_generation); + goto refresh; + } + + if (!local->child_up[read_subvol]) { + /* should never happen, just in case */ + gf_log (this->name, GF_LOG_WARNING, "subvolume %d is the " + "read subvolume in this generation, but is not up", + read_subvol); + goto refresh; + } + + local->read_attempted[read_subvol] = 1; + + local->readfn (frame, this, read_subvol); + + return 0; + +refresh: + afr_inode_refresh (frame, this, inode, afr_read_txn_refresh_done); + + return 0; +} diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 5f08fb007..4dac83113 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -1,584 +1,383 @@ /* - Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ -#include "glusterfs.h" -#include "xlator.h" -#include "byte-order.h" + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif #include "afr.h" -#include "afr-transaction.h" -#include "afr-self-heal-common.h" #include "afr-self-heal.h" +#include "byte-order.h" -/** - * select_source - select a source and return it - */ - int -afr_sh_select_source (int sources[], int child_count) +afr_selfheal_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { - int i; - for (i = 0; i < child_count; i++) - if (sources[i]) - return i; - - return -1; -} + afr_local_t *local = NULL; + local = frame->local; -/** - * sink_count - return number of sinks in sources array - */ + syncbarrier_wake (&local->barrier); -int -afr_sh_sink_count (int sources[], int child_count) -{ - int i; - int sinks = 0; - for (i = 0; i < child_count; i++) - if (!sources[i]) - sinks++; - return sinks; + return 0; } + int -afr_sh_source_count (int sources[], int child_count) +afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode, + int subvol, dict_t *xattr) { - int i; - int nsource = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + loc_t loc = {0, }; - for (i = 0; i < child_count; i++) - if (sources[i]) - nsource++; - return nsource; -} + priv = this->private; + local = frame->local; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); -int -afr_sh_supress_errenous_children (int sources[], int child_errno[], - int child_count) -{ - int i = 0; + STACK_WIND (frame, afr_selfheal_post_op_cbk, priv->children[subvol], + priv->children[subvol]->fops->xattrop, &loc, + GF_XATTROP_ADD_ARRAY, xattr, NULL); - for (i = 0; i < child_count; i++) { - if (child_errno[i] && sources[i]) { - sources[i] = 0; - } - } + syncbarrier_wait (&local->barrier, 1); return 0; } -void -afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) +dict_t * +afr_selfheal_output_xattr (xlator_t *this, afr_transaction_type type, + int *output_dirty, int **output_matrix, int subvol) { - afr_private_t * priv = this->private; + dict_t *xattr = NULL; + afr_private_t *priv = NULL; + int j = 0; + int idx = 0; + int ret = 0; + int *raw = 0; - char *buf = NULL; - char *ptr = NULL; + priv = this->private; + idx = afr_index_for_transaction_type (type); - int i, j; + xattr = dict_new (); + if (!xattr) + return NULL; - /* 10 digits per entry + 1 space + '[' and ']' */ - buf = MALLOC (priv->child_count * 11 + 8); + if (output_dirty[subvol]) { + /* clear dirty */ + raw = GF_CALLOC (sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t); + if (!raw) + goto err; - for (i = 0; i < priv->child_count; i++) { - ptr = buf; - ptr += sprintf (ptr, "[ "); - for (j = 0; j < priv->child_count; j++) { - ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); - } - ptr += sprintf (ptr, "]"); - gf_log (this->name, GF_LOG_TRACE, - "pending_matrix: %s", buf); + raw[idx] = hton32 (output_dirty[subvol]); + ret = dict_set_bin (xattr, AFR_DIRTY, raw, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) + goto err; } - FREE (buf); -} - - -void -afr_sh_build_pending_matrix (afr_private_t *priv, - int32_t *pending_matrix[], dict_t *xattr[], - int child_count, afr_transaction_type type) -{ - int i, j, k; - - int32_t *pending = NULL; - int ret = -1; + /* clear/set pending */ + for (j = 0; j < priv->child_count; j++) { + if (!output_matrix[subvol][j]) + continue; - unsigned char *ignorant_subvols = NULL; + raw = GF_CALLOC (sizeof(int), AFR_NUM_CHANGE_LOGS, + gf_afr_mt_int32_t); + if (!raw) + goto err; - ignorant_subvols = CALLOC (sizeof (*ignorant_subvols), child_count); + raw[idx] = hton32 (output_matrix[subvol][j]); - /* start clean */ - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - pending_matrix[i][j] = 0; - } + ret = dict_set_bin (xattr, priv->pending_key[j], + raw, sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) + goto err; } - for (i = 0; i < child_count; i++) { - pending = NULL; - - for (j = 0; j < child_count; j++) { - ret = dict_get_ptr (xattr[i], priv->pending_key[j], - VOID(&pending)); - - if (ret != 0) { - /* - * There is no xattr present. This means this - * subvolume should be considered an 'ignorant' - * subvolume. - */ - - ignorant_subvols[i] = 1; - continue; - } - - k = afr_index_for_transaction_type (type); - - pending_matrix[i][j] = ntoh32 (pending[k]); - } - } - - /* - * Make all non-ignorant subvols point towards the ignorant - * subvolumes. - */ - - for (i = 0; i < child_count; i++) { - if (ignorant_subvols[i]) { - for (j = 0; j < child_count; j++) { - if (!ignorant_subvols[j]) - pending_matrix[j][i] += 1; - } - } - } - - FREE (ignorant_subvols); + return xattr; +err: + if (xattr) + dict_unref (xattr); + return NULL; } -/** - * mark_sources: Mark all 'source' nodes and return number of source - * nodes found - * - * A node (a row in the pending matrix) belongs to one of - * three categories: - * - * M is the pending matrix. - * - * 'innocent' - M[i] is all zeroes - * 'fool' - M[i] has i'th element = 1 (self-reference) - * 'wise' - M[i] has i'th element = 0, others are 1 or 0. - * - * All 'innocent' nodes are sinks. If all nodes are innocent, no self-heal is - * needed. - * - * A 'wise' node can be a source. If two 'wise' nodes conflict, it is - * a split-brain. If one wise node refers to the other but the other doesn't - * refer back, the referrer is a source. - * - * All fools are sinks, unless there are no 'wise' nodes. In that case, - * one of the fools is made a source. - */ - -typedef enum { - AFR_NODE_INNOCENT, - AFR_NODE_FOOL, - AFR_NODE_WISE -} afr_node_type; - -typedef struct { - afr_node_type type; - int wisdom; -} afr_node_character; - - -static int -afr_sh_is_innocent (int32_t *array, int child_count) +int +afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, afr_transaction_type type, + struct afr_reply *replies, unsigned char *locked_on) { - int i = 0; - int ret = 1; /* innocent until proven guilty */ - - for (i = 0; i < child_count; i++) { - if (array[i]) { - ret = 0; - break; - } - } + afr_private_t *priv = NULL; + int i = 0; + int j = 0; + unsigned char *pending = NULL; + int *input_dirty = NULL; + int **input_matrix = NULL; + int *output_dirty = NULL; + int **output_matrix = NULL; + dict_t *xattr = NULL; - return ret; -} + priv = this->private; + pending = alloca0 (priv->child_count); -static int -afr_sh_is_fool (int32_t *array, int i, int child_count) -{ - return array[i]; /* fool if accuses itself */ -} + input_dirty = alloca0 (priv->child_count * sizeof (int)); + input_matrix = ALLOC_MATRIX (priv->child_count, int); + output_dirty = alloca0 (priv->child_count * sizeof (int)); + output_matrix = ALLOC_MATRIX (priv->child_count, int); + afr_selfheal_extract_xattr (this, replies, type, input_dirty, + input_matrix); -static int -afr_sh_is_wise (int32_t *array, int i, int child_count) -{ - return !array[i]; /* wise if does not accuse itself */ -} - + for (i = 0; i < priv->child_count; i++) + if (sinks[i] && !healed_sinks[i]) + pending[i] = 1; -static int -afr_sh_all_nodes_innocent (afr_node_character *characters, - int child_count) -{ - int i = 0; - int ret = 1; + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + if (pending[j]) + output_matrix[i][j] = 1; + else + output_matrix[i][j] = -input_matrix[i][j]; + } + } - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_INNOCENT) { - ret = 0; - break; - } - } + for (i = 0; i < priv->child_count; i++) { + if (!pending[i]) + output_dirty[i] = -input_dirty[i]; + } - return ret; -} + for (i = 0; i < priv->child_count; i++) { + if (!locked_on[i]) + /* perform post-op only on subvols we had locked + and inspected on. + */ + continue; + xattr = afr_selfheal_output_xattr (this, type, output_dirty, + output_matrix, i); + if (!xattr) { + gf_log (this->name, GF_LOG_ERROR, + "unable to allocate xdata for subvol %d", i); + continue; + } -static int -afr_sh_wise_nodes_exist (afr_node_character *characters, int child_count) -{ - int i = 0; - int ret = 0; + afr_selfheal_post_op (frame, this, inode, i, xattr); - for (i = 0; i < child_count; i++) { - if (characters[i].type == AFR_NODE_WISE) { - ret = 1; - break; - } - } + dict_unref (xattr); + } - return ret; + return 0; } -/* - * The 'wisdom' of a wise node is 0 if any other wise node accuses it. - * It is 1 if no other wise node accuses it. - * Only wise nodes with wisdom 1 are sources. - * - * If no nodes with wisdom 1 exist, a split-brain has occured. - */ - -static void -afr_sh_compute_wisdom (int32_t *pending_matrix[], - afr_node_character characters[], int child_count) +void +afr_replies_copy (struct afr_reply *dst, struct afr_reply *src, int count) { - int i = 0; - int j = 0; - - for (i = 0; i < child_count; i++) { - if (characters[i].type == AFR_NODE_WISE) { - characters[i].wisdom = 1; - - for (j = 0; j < child_count; j++) { - if ((characters[j].type == AFR_NODE_WISE) - && pending_matrix[j][i]) { - - characters[i].wisdom = 0; - } - } - } - } + int i = 0; + dict_t *xdata = NULL; + + if (dst == src) + return; + + for (i = 0; i < count; i++) { + dst[i].valid = src[i].valid; + dst[i].op_ret = src[i].op_ret; + dst[i].op_errno = src[i].op_errno; + dst[i].prestat = src[i].prestat; + dst[i].poststat = src[i].poststat; + dst[i].preparent = src[i].preparent; + dst[i].postparent = src[i].postparent; + dst[i].preparent2 = src[i].preparent2; + dst[i].postparent2 = src[i].postparent2; + if (src[i].xdata) + xdata = dict_ref (src[i].xdata); + else + xdata = NULL; + if (dst[i].xdata) + dict_unref (dst[i].xdata); + dst[i].xdata = xdata; + memcpy (dst[i].checksum, src[i].checksum, + MD5_DIGEST_LENGTH); + } } -static int -afr_sh_wise_nodes_conflict (afr_node_character *characters, - int child_count) +int +afr_selfheal_fill_dirty (xlator_t *this, int *dirty, int subvol, + int idx, dict_t *xdata) { - int i = 0; - int ret = 1; + void *pending_raw = NULL; + int pending[3] = {0, }; - for (i = 0; i < child_count; i++) { - if ((characters[i].type == AFR_NODE_WISE) - && characters[i].wisdom == 1) { + if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw)) + return -1; - /* There is atleast one bona-fide wise node */ - ret = 0; - break; - } - } + if (!pending_raw) + return -1; - return ret; -} + memcpy (pending, pending_raw, sizeof(pending)); + dirty[subvol] = ntoh32 (pending[idx]); -static int -afr_sh_mark_wisest_as_sources (int sources[], - afr_node_character *characters, - int child_count) -{ - int nsources = 0; - - int i = 0; - - for (i = 0; i < child_count; i++) { - if (characters[i].wisdom == 1) { - sources[i] = 1; - nsources++; - } - } - - return nsources; + return 0; } -static int -afr_sh_mark_if_size_differs (afr_self_heal_t *sh, int child_count) +int +afr_selfheal_fill_matrix (xlator_t *this, int **matrix, int subvol, + int idx, dict_t *xdata) { - int32_t ** pending_matrix; - int i, j; - - int size_differs = 0; - - pending_matrix = sh->pending_matrix; - - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[j]) - && (pending_matrix[i][j] == 0) - && (pending_matrix[j][i] == 0)) { - - pending_matrix[i][j] = 1; - pending_matrix[j][i] = 1; - - size_differs = 1; - } - } - } - - return size_differs; -} + int i = 0; + void *pending_raw = NULL; + int pending[3] = {0, }; + afr_private_t *priv = NULL; - -static int -afr_sh_mark_biggest_fool_as_source (afr_self_heal_t *sh, - afr_node_character *characters, - int child_count) -{ - int i = 0; - int biggest = 0; - - for (i = 0; i < child_count; i++) { - if (characters[i].type == AFR_NODE_FOOL) { - biggest = i; - break; - } - } - - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_FOOL) - continue; - - if (SIZE_GREATER (&sh->buf[i], &sh->buf[biggest])) { - biggest = i; - } - } - - sh->sources[biggest] = 1; - - return 1; -} + priv = this->private; + for (i = 0; i < priv->child_count; i++) { + if (dict_get_ptr (xdata, priv->pending_key[i], &pending_raw)) + continue; -static int -afr_sh_mark_biggest_as_source (afr_self_heal_t *sh, int child_count) -{ - int biggest = 0; - int i; + if (!pending_raw) + continue; - for (i = 0; i < child_count; i++) { - if (SIZE_GREATER (&sh->buf[i], &sh->buf[biggest])) { - biggest = i; - } - } + memcpy (pending, pending_raw, sizeof(pending)); - sh->sources[biggest] = 1; + matrix[subvol][i] = ntoh32 (pending[idx]); + } - return 1; + return 0; } int -afr_sh_mark_sources (afr_self_heal_t *sh, int child_count, - afr_self_heal_type type) +afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies, + afr_transaction_type type, int *dirty, int **matrix) { + afr_private_t *priv = NULL; int i = 0; + dict_t *xdata = NULL; + int idx = -1; - int32_t ** pending_matrix; - int * sources; + idx = afr_index_for_transaction_type (type); - int size_differs = 0; - - pending_matrix = sh->pending_matrix; - sources = sh->sources; + priv = this->private; - int nsources = 0; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].xdata) + continue; - /* stores the 'characters' (innocent, fool, wise) of the nodes */ - afr_node_character * - characters = CALLOC (sizeof (afr_node_character), - child_count); + xdata = replies[i].xdata; - /* start clean */ - for (i = 0; i < child_count; i++) { - sources[i] = 0; + afr_selfheal_fill_dirty (this, dirty, i, idx, xdata); + afr_selfheal_fill_matrix (this, matrix, i, idx, xdata); } - - for (i = 0; i < child_count; i++) { - if (afr_sh_is_innocent (pending_matrix[i], child_count)) { - characters[i].type = AFR_NODE_INNOCENT; - - } else if (afr_sh_is_fool (pending_matrix[i], i, child_count)) { - characters[i].type = AFR_NODE_FOOL; - - } else if (afr_sh_is_wise (pending_matrix[i], i, child_count)) { - characters[i].type = AFR_NODE_WISE; - - } else { - gf_log ("[module:replicate]", GF_LOG_ERROR, - "Could not determine the state of subvolume %d!" - " (This message should never appear." - " Please file a bug report to " - "<gluster-devel@nongnu.org>.)", i); - } - } - - if (type == AFR_SELF_HEAL_DATA) { - size_differs = afr_sh_mark_if_size_differs (sh, child_count); - } - - if (afr_sh_all_nodes_innocent (characters, child_count)) { - if (size_differs) { - nsources = afr_sh_mark_biggest_as_source (sh, - child_count); - } - - } else if (afr_sh_wise_nodes_exist (characters, child_count)) { - afr_sh_compute_wisdom (pending_matrix, characters, child_count); - - if (afr_sh_wise_nodes_conflict (characters, child_count)) { - /* split-brain */ - - nsources = -1; - goto out; - - } else { - nsources = afr_sh_mark_wisest_as_sources (sources, - characters, - child_count); - } - } else { - nsources = afr_sh_mark_biggest_fool_as_source (sh, characters, - child_count); - } - -out: - FREE (characters); - return nsources; + return 0; } -void -afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, - int32_t *delta_matrix[], int success[], - int child_count, afr_transaction_type type) -{ - int i = 0; - int j = 0; - int k = 0; - - int32_t * pending = NULL; - int ret = 0; - - /* start clean */ - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - delta_matrix[i][j] = 0; - } - } - - for (i = 0; i < child_count; i++) { - pending = NULL; - - for (j = 0; j < child_count; j++) { - ret = dict_get_ptr (xattr[i], priv->pending_key[j], - VOID(&pending)); - - if (!success[j]) - continue; - - k = afr_index_for_transaction_type (type); - - if (pending) { - delta_matrix[i][j] = -(ntoh32 (pending[k])); - } else { - delta_matrix[i][j] = 0; - } - - } - } -} +/* + * This function determines if a self-heal is required for a given inode, + * and if needed, in what direction. + * + * locked_on[] is the array representing servers which have been locked and + * from which xattrs have been fetched for analysis. + * + * The output of the function is by filling the arrays sources[] and sinks[]. + * + * sources[i] is set if i'th server is an eligible source for a selfheal. + * + * sinks[i] is set if i'th server needs to be healed. + * + * if sources[0..N] are all set, there is no need for a selfheal. + * + * if sinks[0..N] are all set, the inode is in split brain. + * + */ int -afr_sh_delta_to_xattr (afr_private_t *priv, - int32_t *delta_matrix[], dict_t *xattr[], - int child_count, afr_transaction_type type) +afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, + afr_transaction_type type, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks) { + afr_private_t *priv = NULL; int i = 0; int j = 0; - int k = 0; + int *dirty = NULL; + int **matrix = NULL; + char *accused = NULL; - int ret = 0; + priv = this->private; - int32_t *pending = 0; + dirty = alloca0 (priv->child_count * sizeof (int)); + accused = alloca0 (priv->child_count); + matrix = ALLOC_MATRIX(priv->child_count, int); - for (i = 0; i < child_count; i++) { - if (!xattr[i]) - continue; + /* First construct the pending matrix for further analysis */ + afr_selfheal_extract_xattr (this, replies, type, dirty, matrix); + + /* Next short list all accused to exclude them from being sources */ + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + if (matrix[i][j]) + accused[j] = 1; + } + } - for (j = 0; j < child_count; j++) { - pending = CALLOC (sizeof (int32_t), 3); - /* 3 = data+metadata+entry */ + /* Short list all non-accused as sources */ + memset (sources, 0, priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (!accused[i] && locked_on[i]) + sources[i] = 1; + } - k = afr_index_for_transaction_type (type); + /* Everyone accused by sources are sinks */ + memset (sinks, 0, priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + for (j = 0; j < priv->child_count; j++) { + if (matrix[i][j]) + sinks[j] = 1; + } + } - pending[k] = hton32 (delta_matrix[i][j]); + /* If any source has 'dirty' bit, pick first + 'dirty' source and make everybody else sinks */ + for (i = 0; i < priv->child_count; i++) { + if (sources[i] && dirty[i]) { + for (j = 0; j < priv->child_count; j++) { + if (j != i) { + sources[j] = 0; + sinks[j] = 1; + } + } + break; + } + } - ret = dict_set_bin (xattr[i], priv->pending_key[j], - pending, - 3 * sizeof (int32_t)); + /* If no sources, all locked nodes are sinks - split brain */ + if (AFR_COUNT (sources, priv->child_count) == 0) { + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i]) + sinks[i] = 1; } } @@ -587,782 +386,624 @@ afr_sh_delta_to_xattr (afr_private_t *priv, int -afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this) +afr_selfheal_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *parbuf) { - afr_private_t *priv = NULL; - int32_t *pending = NULL; - void *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */ - - int ret = -1; - int i = 0; - int j = 0; + afr_local_t *local = NULL; + int i = -1; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &tmp_pending); - - if (ret != 0) - return 0; - - pending = tmp_pending; + local = frame->local; + i = (long) cookie; - j = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (buf) + local->replies[i].poststat = *buf; + if (parbuf) + local->replies[i].postparent = *parbuf; + if (xdata) + local->replies[i].xdata = dict_ref (xdata); - if (pending[j]) - return 1; - } + syncbarrier_wake (&local->barrier); return 0; } -int -afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this) +inode_t * +afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent, + const char *name, struct afr_reply *replies, + unsigned char *lookup_on) { + loc_t loc = {0, }; + dict_t *xattr_req = NULL; + afr_local_t *local = NULL; afr_private_t *priv = NULL; - int32_t *pending = NULL; - void *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */ + inode_t *inode = NULL; - int ret = -1; - int i = 0; - int j = 0; + local = frame->local; + priv = frame->this->private; - priv = this->private; + xattr_req = dict_new (); + if (!xattr_req) + return NULL; + + if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) { + dict_destroy (xattr_req); + return NULL; + } - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &tmp_pending); + inode = inode_new (parent->table); + if (!inode) { + dict_destroy (xattr_req); + return NULL; + } - if (ret != 0) - return 0; - - pending = tmp_pending; + loc.parent = inode_ref (parent); + uuid_copy (loc.pargfid, parent->gfid); + loc.name = name; + loc.inode = inode_ref (inode); - j = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); + AFR_ONLIST (lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xattr_req); - if (pending[j]) - return 1; - } + afr_replies_copy (replies, local->replies, priv->child_count); - return 0; + loc_wipe (&loc); + dict_unref (xattr_req); + + return inode; } int -afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this) +afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode, + uuid_t gfid, struct afr_reply *replies, + unsigned char *discover_on) { - afr_private_t *priv = NULL; - int32_t *pending = NULL; - void *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */ + loc_t loc = {0, }; + dict_t *xattr_req = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - int ret = -1; - int i = 0; - int j = 0; + local = frame->local; + priv = frame->this->private; - priv = this->private; + xattr_req = dict_new (); + if (!xattr_req) + return -ENOMEM; - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &tmp_pending); + if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) { + dict_destroy (xattr_req); + return -ENOMEM; + } - if (ret != 0) - return 0; - - pending = tmp_pending; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, gfid); - j = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION); + AFR_ONLIST (discover_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xattr_req); - if (pending[j]) - return 1; - } + afr_replies_copy (replies, local->replies, priv->child_count); + + loc_wipe (&loc); + dict_unref (xattr_req); return 0; } - -/** - * is_matrix_zero - return true if pending matrix is all zeroes - */ - int -afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count) +afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode, + uuid_t gfid, struct afr_reply *replies) { - int i, j; + afr_private_t *priv = NULL; + + priv = frame->this->private; - for (i = 0; i < child_count; i++) - for (j = 0; j < child_count; j++) - if (pending_matrix[i][j]) - return 0; - return 1; + return afr_selfheal_unlocked_discover_on (frame, inode, gfid, replies, + priv->child_up); } int -afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this) +afr_selfheal_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; + afr_local_t *local = NULL; + int i = 0; local = frame->local; - sh = &local->self_heal; - priv = this->private; + i = (long) cookie; -// memset (sh->child_errno, 0, sizeof (int) * priv->child_count); - memset (sh->buf, 0, sizeof (struct stat) * priv->child_count); - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) - dict_unref (sh->xattr[i]); - sh->xattr[i] = NULL; - } + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; - if (local->govinda_gOvinda) { - gf_log (this->name, GF_LOG_TRACE, - "aborting selfheal of %s", - local->loc.path); - sh->completion_cbk (frame, this); - } else { - gf_log (this->name, GF_LOG_TRACE, - "proceeding to metadata check on %s", - local->loc.path); - afr_self_heal_metadata (frame, this); - } + syncbarrier_wake (&local->barrier); return 0; } int -sh_missing_entries_unlck_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno) +afr_selfheal_locked_fill (call_frame_t *frame, xlator_t *this, + unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - + int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int count = 0; local = frame->local; - sh = &local->self_heal; priv = this->private; - LOCK (&frame->lock); - { - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_sh_missing_entries_done (frame, this); + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && local->replies[i].op_ret == 0) { + locked_on[i] = 1; + count++; + } else { + locked_on[i] = 0; + } } - return 0; + return count; } - - -static int -sh_missing_entries_finish (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int i = 0; - int call_count = 0; - afr_self_heal_t *sh = NULL; - local = frame->local; - sh = &local->self_heal; - priv = this->private; +int +afr_selfheal_tryinodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on) +{ + loc_t loc = {0,}; + struct gf_flock flock = {0, }; - call_count = local->child_count; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - local->call_count = call_count; + flock.l_type = F_WRLCK; + flock.l_start = off; + flock.l_len = size; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - gf_log (this->name, GF_LOG_TRACE, - "unlocking %"PRId64"/%s on subvolume %s", - sh->parent_loc.inode->ino, local->loc.name, - priv->children[i]->name); - - STACK_WIND (frame, sh_missing_entries_unlck_cbk, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, - &sh->parent_loc, local->loc.name, - ENTRYLK_UNLOCK, ENTRYLK_WRLCK); - - if (!--call_count) - break; - } - } - return 0; -} + AFR_ONALL (frame, afr_selfheal_lock_cbk, inodelk, dom, + &loc, F_SETLK, &flock, NULL); + loc_wipe (&loc); -static int -sh_destroy_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int op_errno, struct stat *stbuf) -{ - STACK_DESTROY (frame->root); - return 0; + return afr_selfheal_locked_fill (frame, this, locked_on); } -static int -sh_missing_entries_newentry_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct stat *stbuf) +int +afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - call_frame_t *chown_frame = NULL; - int call_count = 0; - int child_index = 0; - struct stat *buf = NULL; - + loc_t loc = {0,}; + struct gf_flock flock = {0, }; + afr_local_t *local = NULL; + int i = 0; + afr_private_t *priv = NULL; - local = frame->local; - sh = &local->self_heal; priv = this->private; + local = frame->local; - buf = &sh->buf[sh->source]; - child_index = (long) cookie; - - if (op_ret == 0) { - chown_frame = copy_frame (frame); + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - gf_log (this->name, GF_LOG_TRACE, - "chown %s to %d %d on subvolume %s", - local->loc.path, buf->st_uid, buf->st_gid, - priv->children[child_index]->name); + flock.l_type = F_WRLCK; + flock.l_start = off; + flock.l_len = size; - STACK_WIND (chown_frame, sh_destroy_cbk, - priv->children[child_index], - priv->children[child_index]->fops->chown, - &local->loc, - buf->st_uid, buf->st_gid); - } + AFR_ONALL (frame, afr_selfheal_lock_cbk, inodelk, dom, + &loc, F_SETLK, &flock, NULL); - LOCK (&frame->lock); - { + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == -1 && + local->replies[i].op_errno == EAGAIN) { + afr_selfheal_locked_fill (frame, this, locked_on); + afr_selfheal_uninodelk (frame, this, inode, dom, off, + size, locked_on); + + AFR_SEQ (frame, afr_selfheal_lock_cbk, inodelk, dom, + &loc, F_SETLKW, &flock, NULL); + break; + } } - UNLOCK (&frame->lock); - call_count = afr_frame_return (frame); + loc_wipe (&loc); - if (call_count == 0) { - sh_missing_entries_finish (frame, this); - } - - return 0; + return afr_selfheal_locked_fill (frame, this, locked_on); } -static int -sh_missing_entries_mknod (call_frame_t *frame, xlator_t *this) +int +afr_selfheal_uninodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + const unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - int enoent_count = 0; - int call_count = 0; - mode_t st_mode = 0; - dev_t st_dev = 0; + loc_t loc = {0,}; + struct gf_flock flock = {0, }; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - for (i = 0; i < priv->child_count; i++) - if (sh->child_errno[i] == ENOENT) - enoent_count++; + flock.l_type = F_UNLCK; + flock.l_start = off; + flock.l_len = size; - call_count = enoent_count; - local->call_count = call_count; + AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, inodelk, + dom, &loc, F_SETLK, &flock, NULL); - st_mode = sh->buf[sh->source].st_mode; - st_dev = sh->buf[sh->source].st_dev; - - gf_log (this->name, GF_LOG_TRACE, - "mknod %s mode 0%o on %d subvolumes", - local->loc.path, st_mode, enoent_count); - - for (i = 0; i < priv->child_count; i++) { - if (sh->child_errno[i] == ENOENT) { - STACK_WIND_COOKIE (frame, - sh_missing_entries_newentry_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mknod, - &local->loc, st_mode, st_dev); - if (!--call_count) - break; - } - } + loc_wipe (&loc); return 0; } -static int -sh_missing_entries_mkdir (call_frame_t *frame, xlator_t *this) +int +afr_selfheal_tryentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - int enoent_count = 0; - int call_count = 0; - mode_t st_mode = 0; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; + loc_t loc = {0,}; - for (i = 0; i < priv->child_count; i++) - if (sh->child_errno[i] == ENOENT) - enoent_count++; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - call_count = enoent_count; - local->call_count = call_count; + AFR_ONALL (frame, afr_selfheal_lock_cbk, entrylk, dom, + &loc, name, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); - st_mode = sh->buf[sh->source].st_mode; - - gf_log (this->name, GF_LOG_TRACE, - "mkdir %s mode 0%o on %d subvolumes", - local->loc.path, st_mode, enoent_count); - - for (i = 0; i < priv->child_count; i++) { - if (sh->child_errno[i] == ENOENT) { - STACK_WIND_COOKIE (frame, - sh_missing_entries_newentry_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mkdir, - &local->loc, st_mode); - if (!--call_count) - break; - } - } + loc_wipe (&loc); - return 0; + return afr_selfheal_locked_fill (frame, this, locked_on); } -static int -sh_missing_entries_symlink (call_frame_t *frame, xlator_t *this, - const char *link) +int +afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - int enoent_count = 0; - int call_count = 0; - + loc_t loc = {0,}; + afr_local_t *local = NULL; + int i = 0; + afr_private_t *priv = NULL; - local = frame->local; - sh = &local->self_heal; priv = this->private; + local = frame->local; - for (i = 0; i < priv->child_count; i++) - if (sh->child_errno[i] == ENOENT) - enoent_count++; - - call_count = enoent_count; - local->call_count = call_count; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - gf_log (this->name, GF_LOG_TRACE, - "symlink %s -> %s on %d subvolumes", - local->loc.path, link, enoent_count); + AFR_ONALL (frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, + name, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); for (i = 0; i < priv->child_count; i++) { - if (sh->child_errno[i] == ENOENT) { - STACK_WIND_COOKIE (frame, - sh_missing_entries_newentry_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->symlink, - link, &local->loc); - if (!--call_count) - break; + if (local->replies[i].op_ret == -1 && + local->replies[i].op_errno == EAGAIN) { + afr_selfheal_locked_fill (frame, this, locked_on); + afr_selfheal_unentrylk (frame, this, inode, dom, name, + locked_on); + + AFR_SEQ (frame, afr_selfheal_lock_cbk, entrylk, dom, + &loc, name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); + break; } } - return 0; -} - - -static int -sh_missing_entries_readlink_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - const char *link) -{ - if (op_ret > 0) - sh_missing_entries_symlink (frame, this, link); - else - sh_missing_entries_finish (frame, this); + loc_wipe (&loc); - return 0; + return afr_selfheal_locked_fill (frame, this, locked_on); } -static int -sh_missing_entries_readlink (call_frame_t *frame, xlator_t *this) +int +afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; + loc_t loc = {0,}; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - local = frame->local; - sh = &local->self_heal; - priv = this->private; + AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, entrylk, + dom, &loc, name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL); - STACK_WIND (frame, sh_missing_entries_readlink_cbk, - priv->children[sh->source], - priv->children[sh->source]->fops->readlink, - &local->loc, 4096); + loc_wipe (&loc); return 0; } -static int -sh_missing_entries_create (call_frame_t *frame, xlator_t *this) +gf_boolean_t +afr_is_pending_set (xlator_t *this, dict_t *xdata, int type) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int type = 0; - int i = 0; - afr_private_t *priv = NULL; - int enoent_count = 0; - int govinda_gOvinda = 0; - + int idx = -1; + afr_private_t *priv = NULL; + void *pending_raw = NULL; + int *pending_int = NULL; + int i = 0; - local = frame->local; - sh = &local->self_heal; priv = this->private; + idx = afr_index_for_transaction_type (type); - for (i = 0; i < priv->child_count; i++) { - if (sh->child_errno[i]) { - if (sh->child_errno[i] == ENOENT) - enoent_count++; - } else { - if (type) { - if (type != (sh->buf[i].st_mode & S_IFMT)) - govinda_gOvinda = 1; - } else { - sh->source = i; - type = sh->buf[i].st_mode & S_IFMT; - } - } - } - - if (govinda_gOvinda) { - gf_log (this->name, GF_LOG_ERROR, - "conflicing filetypes exist for path %s. returning.", - local->loc.path); + if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw) == 0) { + if (pending_raw) { + pending_int = pending_raw; - local->govinda_gOvinda = 1; - sh_missing_entries_finish (frame, this); - return 0; - } - - if (!type) { - gf_log (this->name, GF_LOG_ERROR, - "no source found for %s. all nodes down?. returning.", - local->loc.path); - /* subvolumes down and/or file does not exist */ - sh_missing_entries_finish (frame, this); - return 0; + if (ntoh32 (pending_int[idx])) + return _gf_true; + } } - if (enoent_count == 0) { - gf_log (this->name, GF_LOG_ERROR, - "no missing files - %s. proceeding to metadata check", - local->loc.path); - /* proceed to next step - metadata self-heal */ - sh_missing_entries_finish (frame, this); - return 0; - } + for (i = 0; i < priv->child_count; i++) { + if (dict_get_ptr (xdata, priv->pending_key[i], + &pending_raw)) + continue; + if (!pending_raw) + continue; + pending_int = pending_raw; - switch (type) { - case S_IFSOCK: - case S_IFREG: - case S_IFBLK: - case S_IFCHR: - case S_IFIFO: - sh_missing_entries_mknod (frame, this); - break; - case S_IFLNK: - sh_missing_entries_readlink (frame, this); - break; - case S_IFDIR: - sh_missing_entries_mkdir (frame, this); - break; - default: - gf_log (this->name, GF_LOG_ERROR, - "unknown file type: 0%o", type); - local->govinda_gOvinda = 1; - sh_missing_entries_finish (frame, this); + if (ntoh32 (pending_int[idx])) + return _gf_true; } - return 0; + return _gf_false; } -static int -sh_missing_entries_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct stat *buf, dict_t *xattr) +gf_boolean_t +afr_is_data_set (xlator_t *this, dict_t *xdata) { - int child_index = 0; - afr_local_t *local = NULL; - int call_count = 0; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - + return afr_is_pending_set (this, xdata, AFR_DATA_TRANSACTION); +} - local = frame->local; - sh = &local->self_heal; - priv = this->private; +gf_boolean_t +afr_is_metadata_set (xlator_t *this, dict_t *xdata) +{ + return afr_is_pending_set (this, xdata, AFR_METADATA_TRANSACTION); +} - child_index = (long) cookie; +gf_boolean_t +afr_is_entry_set (xlator_t *this, dict_t *xdata) +{ + return afr_is_pending_set (this, xdata, AFR_ENTRY_TRANSACTION); +} - LOCK (&frame->lock); - { - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "path %s on subvolume %s is of mode 0%o", - local->loc.path, - priv->children[child_index]->name, - buf->st_mode); - local->self_heal.buf[child_index] = *buf; - } else { - gf_log (this->name, GF_LOG_TRACE, - "path %s on subvolume %s => -1 (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - - local->self_heal.child_errno[child_index] = op_errno; - } +void +afr_inode_link (inode_t *inode, struct iatt *iatt) +{ + inode_t *linked_inode = NULL; - } - UNLOCK (&frame->lock); + linked_inode = inode_link (inode, NULL, NULL, iatt); - call_count = afr_frame_return (frame); + uuid_copy (inode->gfid, iatt->ia_gfid); + inode->ia_type = iatt->ia_type; - if (call_count == 0) { - sh_missing_entries_create (frame, this); + if (linked_inode) { + inode_lookup (linked_inode); + inode_unref (linked_inode); } - - return 0; } -static int -sh_missing_entries_lookup (call_frame_t *frame, xlator_t *this) +/* + * This function inspects the looked up replies (in an unlocked manner) + * and decides whether a locked verification and possible healing is + * required or not. It updates the three booleans for each type + * of healing. If the boolean flag gets set to FALSE, then we are sure + * no healing is required. If the boolean flag gets set to TRUE then + * we have to proceed with locked reinspection. + */ + +int +afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this, + inode_t *inode, uuid_t gfid, + gf_boolean_t *data_selfheal, + gf_boolean_t *metadata_selfheal, + gf_boolean_t *entry_selfheal) { - afr_local_t *local = NULL; - int i = 0; - int call_count = 0; - afr_private_t *priv = NULL; - dict_t *xattr_req = NULL; - int ret = -1; + afr_private_t *priv = NULL; + int i = 0; + int valid_cnt = 0; + struct iatt first = {0, }; + struct afr_reply *replies = NULL; + int ret = -1; - local = frame->local; - call_count = local->child_count; priv = this->private; - local->call_count = call_count; - - xattr_req = dict_new(); - - if (xattr_req) { - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_uint64 (xattr_req, - priv->pending_key[i], - 3 * sizeof(int32_t)); - } - } + replies = alloca0 (sizeof (*replies) * priv->child_count); + + ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies); + if (ret) + return ret; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on subvolume %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, - sh_missing_entries_lookup_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - &local->loc, xattr_req); - - if (!--call_count) - break; - } - } - - if (xattr_req) - dict_unref (xattr_req); + if (!replies[i].valid) + continue; + if (replies[i].op_ret == -1) + continue; - return 0; -} + if (afr_is_data_set (this, replies[i].xdata)) + *data_selfheal = _gf_true; + if (afr_is_metadata_set (this, replies[i].xdata)) + *metadata_selfheal = _gf_true; -static int -sh_missing_entries_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int call_count = 0; - int child_index = (long) cookie; + if (afr_is_entry_set (this, replies[i].xdata)) + *entry_selfheal = _gf_true; + valid_cnt ++; + if (valid_cnt == 1) { + first = replies[i].poststat; + continue; + } - local = frame->local; - sh = &local->self_heal; + if (!IA_EQUAL (first, replies[i].poststat, type)) { + gf_log (this->name, GF_LOG_ERROR, + "TYPE mismatch %d vs %d on %s for gfid:%s", + (int) first.ia_type, + (int) replies[i].poststat.ia_type, + priv->children[i]->name, + uuid_utoa (replies[i].poststat.ia_gfid)); + return -EIO; + } - LOCK (&frame->lock); - { - if (op_ret == -1) { - sh->op_failed = 1; + if (!IA_EQUAL (first, replies[i].poststat, uid)) { + gf_log (this->name, GF_LOG_DEBUG, + "UID mismatch %d vs %d on %s for gfid:%s", + (int) first.ia_uid, + (int) replies[i].poststat.ia_uid, + priv->children[i]->name, + uuid_utoa (replies[i].poststat.ia_gfid)); + *metadata_selfheal = _gf_true; + } + + if (!IA_EQUAL (first, replies[i].poststat, gid)) { gf_log (this->name, GF_LOG_DEBUG, - "locking inode of %s on child %d failed: %s", - local->loc.path, child_index, - strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_TRACE, - "inode of %s on child %d locked", - local->loc.path, child_index); + "GID mismatch %d vs %d on %s for gfid:%s", + (int) first.ia_uid, + (int) replies[i].poststat.ia_uid, + priv->children[i]->name, + uuid_utoa (replies[i].poststat.ia_gfid)); + + *metadata_selfheal = _gf_true; } - } - UNLOCK (&frame->lock); - call_count = afr_frame_return (frame); + if (!IA_EQUAL (first, replies[i].poststat, prot)) { + gf_log (this->name, GF_LOG_DEBUG, + "MODE mismatch %d vs %d on %s for gfid:%s", + (int) st_mode_from_ia (first.ia_prot, 0), + (int) st_mode_from_ia (replies[i].poststat.ia_prot, 0), + priv->children[i]->name, + uuid_utoa (replies[i].poststat.ia_gfid)); - if (call_count == 0) { - if (sh->op_failed == 1) { - sh_missing_entries_finish (frame, this); - return 0; + *metadata_selfheal = _gf_true; } - sh_missing_entries_lookup (frame, this); + if (IA_ISREG(first.ia_type) && + !IA_EQUAL (first, replies[i].poststat, size)) { + gf_log (this->name, GF_LOG_DEBUG, + "SIZE mismatch %lld vs %lld on %s for gfid:%s", + (long long) first.ia_size, + (long long) replies[i].poststat.ia_size, + priv->children[i]->name, + uuid_utoa (replies[i].poststat.ia_gfid)); + + *data_selfheal = _gf_true; + } } + if (valid_cnt > 0) + afr_inode_link (inode, &first); + + if (valid_cnt < 2) + return -ENOTCONN; + return 0; } -static int -afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this) +inode_t * +afr_inode_find (xlator_t *this, uuid_t gfid) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - int call_count = 0; + inode_table_t *table = NULL; + inode_t *inode = NULL; + table = this->itable; + if (!table) + return NULL; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + inode = inode_find (table, gfid); + if (inode) + return inode; - gf_log (this->name, GF_LOG_TRACE, - "attempting to recreate missing entries for path=%s", - local->loc.path); + inode = inode_new (table); + if (!inode) + return NULL; - afr_build_parent_loc (&sh->parent_loc, &local->loc); + uuid_copy (inode->gfid, gfid); - call_count = local->child_count; + return inode; +} - local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, sh_missing_entries_lk_cbk, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, - &sh->parent_loc, local->loc.name, - ENTRYLK_LOCK_NB, ENTRYLK_WRLCK); - if (!--call_count) - break; - } - } +call_frame_t * +afr_frame_create (xlator_t *this) +{ + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + int op_errno = 0; + pid_t pid = -1; - return 0; + frame = create_frame (this, this->ctx->pool); + if (!frame) + return NULL; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) { + STACK_DESTROY (frame->root); + return NULL; + } + + syncopctx_setfspid (&pid); + + frame->root->pid = pid; + + afr_set_lk_owner (frame, this, frame->root); + + return frame; } +/* + * This is the entry point for healing a given GFID + */ + int -afr_self_heal (call_frame_t *frame, xlator_t *this, - int (*completion_cbk) (call_frame_t *, xlator_t *)) +afr_selfheal (xlator_t *this, uuid_t gfid) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; + inode_t *inode = NULL; + call_frame_t *frame = NULL; + int ret = -1; + gf_boolean_t data_selfheal = _gf_false; + gf_boolean_t metadata_selfheal = _gf_false; + gf_boolean_t entry_selfheal = _gf_false; + inode = afr_inode_find (this, gfid); + if (!inode) + goto out; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + frame = afr_frame_create (this); + if (!frame) + goto out; - gf_log (this->name, GF_LOG_TRACE, - "performing self heal on %s (metadata=%d data=%d entry=%d)", - local->loc.path, - local->need_metadata_self_heal, - local->need_data_self_heal, - local->need_entry_self_heal); + ret = afr_selfheal_unlocked_inspect (frame, this, inode, gfid, + &data_selfheal, + &metadata_selfheal, + &entry_selfheal); + if (ret) + goto out; - sh->completion_cbk = completion_cbk; + if (data_selfheal) + afr_selfheal_data (frame, this, inode); - sh->buf = CALLOC (priv->child_count, sizeof (struct stat)); - sh->child_errno = CALLOC (priv->child_count, sizeof (int)); - sh->success = CALLOC (priv->child_count, sizeof (int)); - sh->xattr = CALLOC (priv->child_count, sizeof (dict_t *)); - sh->sources = CALLOC (sizeof (*sh->sources), priv->child_count); + if (metadata_selfheal) + afr_selfheal_metadata (frame, this, inode); - sh->pending_matrix = CALLOC (sizeof (int32_t *), priv->child_count); - for (i = 0; i < priv->child_count; i++) { - sh->pending_matrix[i] = CALLOC (sizeof (int32_t), - priv->child_count); - } + if (entry_selfheal) + afr_selfheal_entry (frame, this, inode); - sh->delta_matrix = CALLOC (sizeof (int32_t *), priv->child_count); - for (i = 0; i < priv->child_count; i++) { - sh->delta_matrix[i] = CALLOC (sizeof (int32_t), - priv->child_count); - } - - if (local->success_count && local->enoent_count) { - afr_self_heal_missing_entries (frame, this); - } else { - gf_log (this->name, GF_LOG_TRACE, - "proceeding to metadata check on %s", - local->loc.path); - afr_sh_missing_entries_done (frame, this); - } + inode_forget (inode, 1); +out: + if (inode) + inode_unref (inode); + if (frame) + AFR_STACK_DESTROY (frame); - return 0; + return ret; } diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h deleted file mode 100644 index a311cdf5e..000000000 --- a/xlators/cluster/afr/src/afr-self-heal-common.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -#ifndef __AFR_SELF_HEAL_COMMON_H__ -#define __AFR_SELF_HEAL_COMMON_H__ - -#define FILE_HAS_HOLES(buf) (((buf)->st_size) > ((buf)->st_blocks * 512)) - -typedef enum { - AFR_SELF_HEAL_ENTRY, - AFR_SELF_HEAL_METADATA, - AFR_SELF_HEAL_DATA, -} afr_self_heal_type; - -int -afr_sh_select_source (int sources[], int child_count); - -int -afr_sh_sink_count (int sources[], int child_count); - -int -afr_sh_source_count (int sources[], int child_count); - -int -afr_sh_supress_errenous_children (int sources[], int child_errno[], - int child_count); - -void -afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this); - -void -afr_sh_build_pending_matrix (afr_private_t *priv, - int32_t *pending_matrix[], dict_t *xattr[], - int child_count, afr_transaction_type type); - -void -afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, - int32_t *delta_matrix[], int success[], - int child_count, afr_transaction_type type); - -int -afr_sh_mark_sources (afr_self_heal_t *sh, int child_count, - afr_self_heal_type type); - -int -afr_sh_delta_to_xattr (afr_private_t *priv, - int32_t *delta_matrix[], dict_t *xattr[], - int child_count, afr_transaction_type type); - -int -afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count); - - -#endif /* __AFR_SELF_HEAL_COMMON_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 06372d47d..c0548d995 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -1,1090 +1,635 @@ /* - Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ -#include <libgen.h> -#include <unistd.h> -#include <fnmatch.h> -#include <sys/time.h> -#include <stdlib.h> -#include <signal.h> #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" #endif -#include "glusterfs.h" #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" - -#include "afr-transaction.h" #include "afr-self-heal.h" -#include "afr-self-heal-common.h" +#include "byte-order.h" +enum { + AFR_SELFHEAL_DATA_FULL = 0, + AFR_SELFHEAL_DATA_DIFF, +}; -int -afr_sh_data_done (call_frame_t *frame, xlator_t *this) +#define HAS_HOLES(i) ((i->ia_blocks * 512) < (i->ia_size)) +static int +__checksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, uint32_t weak, uint8_t *strong, + dict_t *xdata) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = (long) cookie; local = frame->local; - sh = &local->self_heal; - priv = this->private; - /* - TODO: cleanup sh->* - */ - - gf_log (this->name, GF_LOG_TRACE, - "self heal of %s completed", - local->loc.path); - - sh->completion_cbk (frame, this); + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (strong) + memcpy (local->replies[i].checksum, strong, MD5_DIGEST_LENGTH); + syncbarrier_wake (&local->barrier); return 0; } -int -afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +static int +attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, struct iatt *post, + dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - int call_count = 0; + int i = (long) cookie; + afr_local_t *local = NULL; local = frame->local; - sh = &local->self_heal; - priv = this->private; - LOCK (&frame->lock); - { - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (pre) + local->replies[i].prestat = *pre; + if (post) + local->replies[i].poststat = *post; + if (xdata) + local->replies[i].xdata = dict_ref (xdata); - if (call_count == 0) { - fd_unref (sh->healing_fd); - sh->healing_fd = NULL; - afr_sh_data_done (frame, this); - } + syncbarrier_wake (&local->barrier); return 0; } -int -afr_sh_data_utimes_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) -{ - afr_sh_data_flush_cbk (frame, cookie, this, op_ret, op_errno); - - return 0; -} - - -int -afr_sh_data_close (call_frame_t *frame, xlator_t *this) +static gf_boolean_t +__afr_selfheal_data_checksums_match (call_frame_t *frame, xlator_t *this, + fd_t *fd, int source, + unsigned char *healed_sinks, + off_t offset, size_t size) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - - int i = 0; - int call_count = 0; - int source = 0; - int active_sinks = 0; - - struct timespec ts[2]; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - active_sinks = sh->active_sinks; - -#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC - ts[0] = sh->buf[source].st_atim; - ts[1] = sh->buf[source].st_mtim; - -#elif HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC - ts[0] = sh->buf[source].st_atimespec; - ts[1] = sh->buf[source].st_mtimespec; -#else - ts[0].tv_sec = sh->buf[source].st_atime; - ts[1].tv_sec = sh->buf[source].st_mtime; -#endif - - if (!sh->healing_fd) { - afr_sh_data_done (frame, this); - return 0; - } + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + unsigned char *wind_subvols = NULL; + int i = 0; - call_count = (sh->active_sinks + 1) * 2; - local->call_count = call_count; - - /* closed source */ - gf_log (this->name, GF_LOG_TRACE, - "closing fd of %s on %s", - local->loc.path, priv->children[sh->source]->name); - - STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, - (void *) (long) sh->source, - priv->children[sh->source], - priv->children[sh->source]->fops->flush, - sh->healing_fd); - call_count--; - - STACK_WIND_COOKIE (frame, afr_sh_data_utimes_cbk, - (void *) (long) sh->source, - priv->children[sh->source], - priv->children[sh->source]->fops->utimens, - &local->loc, ts); - - call_count--; + priv = this->private; + local = frame->local; + wind_subvols = alloca0 (priv->child_count); for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] || !local->child_up[i]) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "closing fd of %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->flush, - sh->healing_fd); - - call_count--; - - STACK_WIND_COOKIE (frame, afr_sh_data_utimes_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->utimens, - &local->loc, ts); - - if (!--call_count) - break; + if (i == source || healed_sinks[i]) + wind_subvols[i] = 1; } - return 0; -} - - -int -afr_sh_data_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t * local = NULL; - int call_count = 0; - int child_index = (long) cookie; - - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "locking inode of %s on child %d failed: %s", - local->loc.path, child_index, - strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_TRACE, - "inode of %s on child %d locked", - local->loc.path, child_index); - } - } - UNLOCK (&frame->lock); + AFR_ONLIST (wind_subvols, frame, __checksum_cbk, rchecksum, fd, + offset, size, NULL); - call_count = afr_frame_return (frame); + if (!local->replies[source].valid || local->replies[source].op_ret != 0) + return _gf_false; - if (call_count == 0) { - afr_sh_data_close (frame, this); + for (i = 0; i < priv->child_count; i++) { + if (i == source) + continue; + if (memcmp (local->replies[source].checksum, + local->replies[i].checksum, + MD5_DIGEST_LENGTH)) + return _gf_false; } - return 0; + return _gf_true; } -int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this) +static int +__afr_selfheal_data_read_write (call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *healed_sinks, + off_t offset, size_t size, + struct afr_reply *replies) { - struct flock flock; - int i = 0; - int call_count = 0; - - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - afr_self_heal_t * sh = NULL; - + struct iovec *iovec = NULL; + int count = 0; + struct iobref *iobref = NULL; + int ret = 0; + int i = 0; + afr_private_t *priv = NULL; - local = frame->local; - sh = &local->self_heal; priv = this->private; - call_count = local->child_count; + ret = syncop_readv (priv->children[source], fd, size, offset, 0, + &iovec, &count, &iobref); + if (ret <= 0) + return ret; - local->call_count = call_count; + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i]) + continue; - flock.l_start = 0; - flock.l_len = 0; - flock.l_type = F_UNLCK; + /* + * TODO: Use fiemap() and discard() to heal holes + * in the future. + * + * For now, + * + * - if the source had any holes at all, + * AND + * - if we are writing past the original file size + * of the sink + * AND + * - is NOT the last block of the source file. if + * the block contains EOF, it has to be written + * in order to set the file size even if the + * last block is 0-filled. + * AND + * - if the read buffer is filled with only 0's + * + * then, skip writing to this source. We don't depend + * on the write to happen to update the size as we + * have performed an ftruncate() upfront anyways. + */ +#define is_last_block(o,b,s) ((s >= o) && (s <= (o + b))) + if (HAS_HOLES ((&replies[source].poststat)) && + offset >= replies[i].poststat.ia_size && + !is_last_block (offset, size, + replies[source].poststat.ia_size) && + (iov_0filled (iovec, count) == 0)) + continue; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - gf_log (this->name, GF_LOG_TRACE, - "unlocking %s on subvolume %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_data_unlck_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->inodelk, - this->name, - &local->loc, F_SETLK, &flock); - if (!--call_count) - break; + ret = syncop_writev (priv->children[i], fd, iovec, count, + offset, iobref, 0); + if (ret != iov_length (iovec, count)) { + /* write() failed on this sink. unset the corresponding + member in sinks[] (which is healed_sinks[] in the + caller) so that this server does NOT get considered + as successfully healed. + */ + healed_sinks[i] = 0; } } + if (iobref) + iobref_unref (iobref); - return 0; -} - - -int -afr_sh_data_finish (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - gf_log (this->name, GF_LOG_TRACE, - "finishing data selfheal of %s", local->loc.path); - - afr_sh_data_unlock (frame, this); - - return 0; + return ret; } -int -afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr) +static int +afr_selfheal_data_block (call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *healed_sinks, off_t offset, + size_t size, int type, struct afr_reply *replies) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; + int ret = -1; + int sink_count = 0; + afr_private_t *priv = NULL; + unsigned char *data_lock = NULL; - local = frame->local; - sh = &local->self_heal; priv = this->private; + sink_count = AFR_COUNT (healed_sinks, priv->child_count); + data_lock = alloca0 (priv->child_count); - LOCK (&frame->lock); + ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name, + offset, size, data_lock); { - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_data_finish (frame, this); - - return 0; -} - - -int -afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; - dict_t **erase_xattr = NULL; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, sh->success, - priv->child_count, AFR_DATA_TRANSACTION); - - erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count); - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - call_count++; - - erase_xattr[i] = get_new_dict(); - dict_ref (erase_xattr[i]); + if (ret < sink_count) { + ret = -ENOTCONN; + goto unlock; } - } - afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr, - priv->child_count, AFR_DATA_TRANSACTION); - - local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (!erase_xattr[i]) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "erasing pending flags from %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_data_erase_pending_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, erase_xattr[i]); - if (!--call_count) - break; - } - - for (i = 0; i < priv->child_count; i++) { - if (erase_xattr[i]) { - dict_unref (erase_xattr[i]); + if (type == AFR_SELFHEAL_DATA_DIFF && + __afr_selfheal_data_checksums_match (frame, this, fd, source, + healed_sinks, offset, size)) { + ret = 0; + goto unlock; } - } - FREE (erase_xattr); - return 0; + ret = __afr_selfheal_data_read_write (frame, this, fd, source, + healed_sinks, offset, size, + replies); + } +unlock: + afr_selfheal_uninodelk (frame, this, fd->inode, this->name, + offset, size, data_lock); + return ret; } -int -afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) + +static int +afr_selfheal_data_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *healed_sinks) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t *sh = NULL; - int call_count = 0; - int child_index = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; - priv = this->private; local = frame->local; - sh = &local->self_heal; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) - gf_log (this->name, GF_LOG_DEBUG, - "ftruncate of %s on subvolume %s failed (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - else - gf_log (this->name, GF_LOG_TRACE, - "ftruncate of %s on subvolume %s completed", - local->loc.path, - priv->children[child_index]->name); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); + priv = this->private; - if (call_count == 0) { - afr_sh_data_erase_pending (frame, this); - } + AFR_ONLIST (healed_sinks, frame, attr_cbk, fsync, fd, 0, NULL); + for (i = 0; i < priv->child_count; i++) + if (healed_sinks[i] && local->replies[i].op_ret != 0) + /* fsync() failed. Do NOT consider this server + as successfully healed. Mark it so. + */ + healed_sinks[i] = 0; return 0; } -int -afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this) +static int +afr_selfheal_data_restore_time (call_frame_t *frame, xlator_t *this, + inode_t *inode, int source, + unsigned char *healed_sinks, + struct afr_reply *replies) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t *sh = NULL; - int *sources = NULL; - int call_count = 0; - int i = 0; + loc_t loc = {0, }; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sources = sh->sources; - call_count = sh->active_sinks; - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (sources[i] || !local->child_up[i]) - continue; + AFR_ONLIST (healed_sinks, frame, attr_cbk, setattr, &loc, + &replies[source].poststat, + (GF_SET_ATTR_ATIME|GF_SET_ATTR_MTIME), NULL); - STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->ftruncate, - sh->healing_fd, sh->file_size); - - if (!--call_count) - break; - } + loc_wipe (&loc); return 0; } - -int -afr_sh_data_read_write_iter (call_frame_t *frame, xlator_t *this); - -int -afr_sh_data_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) +static int +afr_data_self_heal_type_get (afr_private_t *priv, unsigned char *healed_sinks, + int source, struct afr_reply *replies) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t *sh = NULL; + int type = AFR_SELFHEAL_DATA_FULL; + int i = 0; - int child_index = (long) cookie; - int call_count = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - gf_log (this->name, GF_LOG_TRACE, - "wrote %d bytes of data from %s to child %d, offset %"PRId64"", - op_ret, local->loc.path, child_index, sh->offset - op_ret); - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "write to %s failed on subvolume %s (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - sh->op_failed = 1; - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_sh_data_read_write_iter (frame, this); - } - - return 0; + if (priv->data_self_heal_algorithm == NULL) { + type = AFR_SELFHEAL_DATA_FULL; + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i] && i != source) + continue; + if (replies[i].poststat.ia_size) { + type = AFR_SELFHEAL_DATA_DIFF; + break; + } + } + } else if (strcmp (priv->data_self_heal_algorithm, "full") == 0) { + type = AFR_SELFHEAL_DATA_FULL; + } else if (strcmp (priv->data_self_heal_algorithm, "diff") == 0) { + type = AFR_SELFHEAL_DATA_DIFF; + } + return type; } - -int -afr_sh_data_read_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iovec *vector, int32_t count, struct stat *buf, - struct iobref *iobref) +static int +afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *healed_sinks, + struct afr_reply *replies) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t *sh = NULL; - - int child_index = (long) cookie; + afr_private_t *priv = NULL; int i = 0; - int call_count = 0; - - off_t offset; + off_t off = 0; + size_t block = 128 * 1024; + int type = AFR_SELFHEAL_DATA_FULL; + int ret = -1; + call_frame_t *iter_frame = NULL; + char *sinks_str = NULL; + char *p = NULL; priv = this->private; - local = frame->local; - sh = &local->self_heal; - - call_count = sh->active_sinks; - - local->call_count = call_count; - - gf_log (this->name, GF_LOG_TRACE, - "read %d bytes of data from %s on child %d, offset %"PRId64"", - op_ret, local->loc.path, child_index, sh->offset); - - if (op_ret <= 0) { - afr_sh_data_trim_sinks (frame, this); - return 0; - } - - /* what if we read less than block size? */ - offset = sh->offset; - sh->offset += op_ret; - - if (sh->file_has_holes) { - if (iov_0filled (vector, count) == 0) { - /* the iter function depends on the - sh->offset already being updated - above - */ - afr_sh_data_read_write_iter (frame, this); - goto out; - } - } + sinks_str = alloca0 (priv->child_count * 8); + p = sinks_str; for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] || !local->child_up[i]) + if (!healed_sinks[i]) continue; - - /* this is a sink, so write to it */ - STACK_WIND_COOKIE (frame, afr_sh_data_write_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->writev, - sh->healing_fd, vector, count, offset, - iobref); - - if (!--call_count) - break; + p += sprintf (p, "%d ", i); } -out: - return 0; -} + gf_log (this->name, GF_LOG_INFO, "performing data selfheal on %s. " + "source=%d sinks=%s", + uuid_utoa (fd->inode->gfid), source, sinks_str); + type = afr_data_self_heal_type_get (priv, healed_sinks, source, + replies); -int -afr_sh_data_read_write (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t *sh = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - STACK_WIND_COOKIE (frame, afr_sh_data_read_cbk, - (void *) (long) sh->source, - priv->children[sh->source], - priv->children[sh->source]->fops->readv, - sh->healing_fd, sh->block_size, - sh->offset); + iter_frame = afr_copy_frame (frame); + if (!iter_frame) + return -ENOMEM; - return 0; -} - - -int -afr_sh_data_read_write_iter (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t *sh = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; + for (off = 0; off < replies[source].poststat.ia_size; off += block) { + ret = afr_selfheal_data_block (iter_frame, this, fd, source, + healed_sinks, off, block, type, + replies); + if (ret < 0) + goto out; - if (sh->op_failed) { - afr_sh_data_finish (frame, this); - goto out; + AFR_STACK_RESET (iter_frame); } - if (sh->offset >= sh->file_size) { - gf_log (this->name, GF_LOG_TRACE, - "closing fd's of %s", - local->loc.path); - afr_sh_data_trim_sinks (frame, this); + afr_selfheal_data_restore_time (frame, this, fd->inode, source, + healed_sinks, replies); - goto out; - } - - afr_sh_data_read_write (frame, this); + ret = afr_selfheal_data_fsync (frame, this, fd, healed_sinks); out: - return 0; + if (iter_frame) + AFR_STACK_DESTROY (iter_frame); + return ret; } -int -afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) +static int +__afr_selfheal_truncate_sinks (call_frame_t *frame, xlator_t *this, + fd_t *fd, unsigned char *healed_sinks, + struct afr_reply *replies, uint64_t size) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + unsigned char *larger_sinks = 0; + int i = 0; local = frame->local; - sh = &local->self_heal; priv = this->private; - child_index = (long) cookie; - - /* TODO: some of the open's might fail. - In that case, modify cleanup fn to send flush on those - fd's which are already open */ - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_TRACE, - "open of %s failed on child %s (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - sh->op_failed = 1; - } - + larger_sinks = alloca0 (priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (healed_sinks[i] && replies[i].poststat.ia_size > size) + larger_sinks[i] = 1; } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - if (call_count == 0) { - if (sh->op_failed) { - afr_sh_data_finish (frame, this); - return 0; - } - gf_log (this->name, GF_LOG_TRACE, - "fd for %s opened, commencing sync", - local->loc.path); - - gf_log (this->name, GF_LOG_TRACE, - "sourcing file %s from %s to other sinks", - local->loc.path, priv->children[sh->source]->name); - - afr_sh_data_read_write (frame, this); - } + AFR_ONLIST (larger_sinks, frame, attr_cbk, ftruncate, fd, size, NULL); + for (i = 0; i < priv->child_count; i++) + if (healed_sinks[i] && local->replies[i].op_ret == -1) + /* truncate() failed. Do NOT consider this server + as successfully healed. Mark it so. + */ + healed_sinks[i] = 0; return 0; } - -int -afr_sh_data_open (call_frame_t *frame, xlator_t *this) +/* + * If by chance there are multiple sources with differing sizes, select + * the largest file as the source. + * + * This can only happen if data was directly modified in the backend. + */ +static int +__afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources, + unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *locked_on, + struct afr_reply *replies) { - int i = 0; - int call_count = 0; - + int i = 0; + afr_private_t *priv = NULL; + uint64_t size = 0; int source = -1; - int *sources = NULL; - - fd_t *fd = NULL; - - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - afr_self_heal_t *sh = NULL; + int locked_count = 0; + int sources_count = 0; + int healed_sinks_count = 0; - local = frame->local; - sh = &local->self_heal; priv = this->private; - call_count = sh->active_sinks + 1; - local->call_count = call_count; - - fd = fd_create (local->loc.inode, frame->root->pid); - sh->healing_fd = fd; - - source = local->self_heal.source; - sources = local->self_heal.sources; + locked_count = AFR_COUNT (locked_on, priv->child_count); + sources_count = AFR_COUNT (sources, priv->child_count); + healed_sinks_count = AFR_COUNT (healed_sinks, priv->child_count); - sh->block_size = 65536; - sh->file_size = sh->buf[source].st_size; - - if (FILE_HAS_HOLES (&sh->buf[source])) - sh->file_has_holes = 1; - - /* open source */ - STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk, - (void *) (long) source, - priv->children[source], - priv->children[source]->fops->open, - &local->loc, O_RDONLY|O_LARGEFILE, fd); - call_count--; + if (locked_count == healed_sinks_count || !sources_count) { + /* split brain */ + return -EIO; + } - /* open sinks */ for (i = 0; i < priv->child_count; i++) { - if(sources[i] || !local->child_up[i]) + if (!sources[i]) continue; - - STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->open, - &local->loc, - O_WRONLY|O_LARGEFILE, fd); - - if (!--call_count) - break; + if (size <= replies[i].poststat.ia_size) { + size = replies[i].poststat.ia_size; + source = i; + } } - return 0; -} - - -int -afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int active_sinks = 0; - int source = 0; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] == 0 && local->child_up[i] == 1) { - active_sinks++; - sh->success[i] = 1; + if (!sources[i]) + continue; + if (replies[i].poststat.ia_size < size) { + sources[i] = 0; + sinks[i] = 1; } } - sh->success[source] = 1; - - if (active_sinks == 0) { - gf_log (this->name, GF_LOG_TRACE, - "no active sinks for performing self-heal on file %s", - local->loc.path); - afr_sh_data_finish (frame, this); - return 0; - } - sh->active_sinks = active_sinks; - - gf_log (this->name, GF_LOG_DEBUG, - "self-healing file %s from subvolume %s to %d other", - local->loc.path, priv->children[source]->name, active_sinks); - - afr_sh_data_open (frame, this); - return 0; + return source; } - -int -afr_sh_data_fix (call_frame_t *frame, xlator_t *this) +/* + * __afr_selfheal_data_prepare: + * + * This function inspects the on-disk xattrs and determines which subvols + * are sources and sinks. + * + * The return value is the index of the subvolume to be used as the source + * for self-healing, or -1 if no healing is necessary/split brain. + */ +static int +__afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *locked_on, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, + struct afr_reply *replies) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int nsources = 0; - int source = 0; - int i = 0; + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + int i = 0; - local = frame->local; - sh = &local->self_heal; priv = this->private; - afr_sh_build_pending_matrix (priv, sh->pending_matrix, sh->xattr, - priv->child_count, AFR_DATA_TRANSACTION); - - afr_sh_print_pending_matrix (sh->pending_matrix, this); - - nsources = afr_sh_mark_sources (sh, priv->child_count, - AFR_SELF_HEAL_DATA); - - afr_sh_supress_errenous_children (sh->sources, sh->child_errno, - priv->child_count); - - if (nsources == 0) { - gf_log (this->name, GF_LOG_TRACE, - "No self-heal needed for %s", - local->loc.path); - - afr_sh_data_finish (frame, this); - return 0; - } - - if ((nsources == -1) - && (priv->favorite_child != -1) - && (sh->child_errno[priv->favorite_child] == 0)) { - - gf_log (this->name, GF_LOG_DEBUG, - "Picking favorite child %s as authentic source to resolve conflicting data of %s", - priv->children[priv->favorite_child]->name, - local->loc.path); - - sh->sources[priv->favorite_child] = 1; - - nsources = afr_sh_source_count (sh->sources, - priv->child_count); - } - - if (nsources == -1) { - gf_log (this->name, GF_LOG_ERROR, - "Unable to self-heal contents of '%s' (possible split-brain). " - "Please delete the file from all but the preferred " - "subvolume.", local->loc.path); - - local->govinda_gOvinda = 1; - - afr_sh_data_finish (frame, this); - return 0; - } - - source = afr_sh_select_source (sh->sources, priv->child_count); - - if (source == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "No active sources found."); - - afr_sh_data_finish (frame, this); - return 0; - } - - sh->source = source; - local->cont.lookup.buf.st_size = sh->buf[source].st_size; - - /* detect changes not visible through pending flags -- JIC */ - for (i = 0; i < priv->child_count; i++) { - if (i == source || sh->child_errno[i]) - continue; - - if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[source])) - sh->sources[i] = 0; - } - - afr_sh_data_sync_prepare (frame, this); - - return 0; + ret = afr_selfheal_unlocked_discover (frame, fd->inode, fd->inode->gfid, + replies); + if (ret) + return ret; + + ret = afr_selfheal_find_direction (frame, this, replies, + AFR_DATA_TRANSACTION, + locked_on, sources, sinks); + if (ret) + return ret; + + source = __afr_selfheal_data_finalize_source (this, sources, sinks, + healed_sinks, locked_on, + replies); + if (source < 0) + return -EIO; + + for (i = 0; i < priv->child_count; i++) + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). + + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. + */ + healed_sinks[i] = sinks[i] && locked_on[i]; + + return source; } -int -afr_sh_data_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - inode_t *inode, struct stat *buf, dict_t *xattr) +static int +__afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *locked_on) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - int call_count = -1; - int child_index = (long) cookie; + afr_private_t *priv = NULL; + int ret = -1; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *data_lock = NULL; + unsigned char *healed_sinks = NULL; + struct afr_reply *locked_replies = NULL; + int source = -1; + gf_boolean_t compat = _gf_false; + unsigned char *compat_lock = NULL; - local = frame->local; - sh = &local->self_heal; priv = this->private; - LOCK (&frame->lock); + sources = alloca0 (priv->child_count); + sinks = alloca0 (priv->child_count); + healed_sinks = alloca0 (priv->child_count); + data_lock = alloca0 (priv->child_count); + compat_lock = alloca0 (priv->child_count); + + locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count); + + ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name, 0, 0, + data_lock); { - if (op_ret != -1) { - sh->xattr[child_index] = dict_ref (xattr); - sh->buf[child_index] = *buf; + if (ret < 2) { + ret = -ENOTCONN; + goto unlock; } - } - UNLOCK (&frame->lock); - call_count = afr_frame_return (frame); + ret = __afr_selfheal_data_prepare (frame, this, fd, data_lock, + sources, sinks, healed_sinks, + locked_replies); + if (ret < 0) + goto unlock; + + source = ret; + + ret = __afr_selfheal_truncate_sinks (frame, this, fd, healed_sinks, + locked_replies, + locked_replies[source].poststat.ia_size); + if (ret < 0) + goto unlock; + + ret = 0; + + /* Locking from (LLONG_MAX - 2) to (LLONG_MAX - 1) is for + compatibility with older self-heal clients which do not + hold a lock in the @priv->sh_domain domain to guard + against concurrent ongoing self-heals + */ + afr_selfheal_inodelk (frame, this, fd->inode, this->name, + LLONG_MAX - 2, 1, compat_lock); + compat = _gf_true; + } +unlock: + afr_selfheal_uninodelk (frame, this, fd->inode, this->name, 0, 0, + data_lock); + if (ret < 0) + goto out; - if (call_count == 0) { - afr_sh_data_fix (frame, this); - } + ret = afr_selfheal_data_do (frame, this, fd, source, healed_sinks, + locked_replies); + if (ret) + goto out; - return 0; + ret = afr_selfheal_undo_pending (frame, this, fd->inode, sources, sinks, + healed_sinks, AFR_DATA_TRANSACTION, + locked_replies, data_lock); +out: + if (compat) + afr_selfheal_uninodelk (frame, this, fd->inode, this->name, + LLONG_MAX - 2, 1, compat_lock); + return ret; } -int -afr_sh_data_lookup (call_frame_t *frame, xlator_t *this) +static fd_t * +afr_selfheal_data_open (xlator_t *this, inode_t *inode) { - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - dict_t *xattr_req = NULL; - - int call_count = 0; - int i = 0; + loc_t loc = {0,}; int ret = 0; + fd_t *fd = NULL; - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - call_count = local->child_count; - - local->call_count = call_count; - - xattr_req = dict_new(); - if (xattr_req) { - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_uint64 (xattr_req, priv->pending_key[i], - 3 * sizeof(int32_t)); - } - } - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_sh_data_lookup_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - &local->loc, xattr_req); - if (!--call_count) - break; - } - } - - if (xattr_req) - dict_unref (xattr_req); - - return 0; -} + fd = fd_create (inode, 0); + if (!fd) + return NULL; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); -int -afr_sh_data_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int call_count = 0; - int child_index = (long) cookie; - - /* TODO: what if lock fails? */ - - local = frame->local; - sh = &local->self_heal; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - sh->op_failed = 1; - - gf_log (this->name, - GF_LOG_DEBUG, - "locking of %s on child %d failed: %s", - local->loc.path, child_index, - strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_TRACE, - "inode of %s on child %d locked", - local->loc.path, child_index); - } + ret = syncop_open (this, &loc, O_RDWR|O_LARGEFILE, fd); + if (ret) { + fd_unref (fd); + fd = NULL; + } else { + fd_bind (fd); } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - if (call_count == 0) { - if (sh->op_failed) { - afr_sh_data_finish (frame, this); - return 0; - } - - afr_sh_data_lookup (frame, this); - } + loc_wipe (&loc); - return 0; + return fd; } int -afr_sh_data_lock (call_frame_t *frame, xlator_t *this) +afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode) { - struct flock flock; - int i = 0; - int call_count = 0; - - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - afr_self_heal_t * sh = NULL; - + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + int ret = 0; + fd_t *fd = NULL; - local = frame->local; - sh = &local->self_heal; priv = this->private; - call_count = local->child_count; - - local->call_count = call_count; + fd = afr_selfheal_data_open (this, inode); + if (!fd) + return -EIO; - flock.l_start = 0; - flock.l_len = 0; - flock.l_type = F_WRLCK; + locked_on = alloca0 (priv->child_count); - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - gf_log (this->name, GF_LOG_TRACE, - "locking %s on subvolume %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_data_lock_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->inodelk, - this->name, - &local->loc, F_SETLK, &flock); - if (!--call_count) - break; + ret = afr_selfheal_tryinodelk (frame, this, inode, priv->sh_domain, 0, 0, + locked_on); + { + if (ret < 2) { + /* Either less than two subvols available, or another + selfheal (from another server) is in progress. Skip + for now in any case there isn't anything to do. + */ + ret = -ENOTCONN; + goto unlock; } - } - - return 0; -} - -int -afr_self_heal_data (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = this->private; - - - local = frame->local; - sh = &local->self_heal; - - if (local->need_data_self_heal && priv->data_self_heal) { - afr_sh_data_lock (frame, this); - } else { - gf_log (this->name, GF_LOG_TRACE, - "not doing data self heal on %s", - local->loc.path); - afr_sh_data_done (frame, this); + ret = __afr_selfheal_data (frame, this, fd, locked_on); } +unlock: + afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0, locked_on); - return 0; -} + if (fd) + fd_unref (fd); + return ret; +} diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index a5e698cd2..9e714b026 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -1,2100 +1,629 @@ /* - Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ -#include <libgen.h> -#include <unistd.h> -#include <fnmatch.h> -#include <sys/time.h> -#include <stdlib.h> -#include <signal.h> #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" #endif -#include "glusterfs.h" #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" +#include "afr-self-heal.h" #include "byte-order.h" - #include "afr-transaction.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" - - - -int -afr_sh_entry_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - /* - TODO: cleanup sh->* - */ - - gf_log (this->name, GF_LOG_TRACE, - "self heal of %s completed", - local->loc.path); - - sh->completion_cbk (frame, this); - - return 0; -} - - -int -afr_sh_entry_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int call_count = 0; - int child_index = (long) cookie; - - /* TODO: what if lock fails? */ - - local = frame->local; - sh = &local->self_heal; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "unlocking inode of %s on child %d failed: %s", - local->loc.path, child_index, - strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_TRACE, - "unlocked inode of %s on child %d", - local->loc.path, child_index); - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (sh->healing_fd) - fd_unref (sh->healing_fd); - sh->healing_fd = NULL; - afr_sh_entry_done (frame, this); - } - - return 0; -} - - -int -afr_sh_entry_unlock (call_frame_t *frame, xlator_t *this) -{ - int i = 0; - int call_count = 0; - - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - afr_self_heal_t * sh = NULL; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - call_count = local->child_count; - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - gf_log (this->name, GF_LOG_TRACE, - "unlocking %s on subvolume %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_entry_unlck_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, - &local->loc, NULL, - ENTRYLK_UNLOCK, ENTRYLK_WRLCK); - if (!--call_count) - break; - } - } - - return 0; -} - - -int -afr_sh_entry_finish (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - gf_log (this->name, GF_LOG_TRACE, - "finishing entry selfheal of %s", local->loc.path); - - afr_sh_entry_unlock (frame, this); - - return 0; -} - - -int -afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - LOCK (&frame->lock); - { - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_entry_finish (frame, this); - - return 0; -} - - -int -afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; - dict_t **erase_xattr = NULL; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, sh->success, - priv->child_count, AFR_ENTRY_TRANSACTION); - - erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count); - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - call_count++; - - erase_xattr[i] = get_new_dict(); - dict_ref (erase_xattr[i]); - } - } - - afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr, - priv->child_count, AFR_ENTRY_TRANSACTION); - - local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (!erase_xattr[i]) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "erasing pending flags from %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_entry_erase_pending_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, erase_xattr[i]); - if (!--call_count) - break; - } - - for (i = 0; i < priv->child_count; i++) { - if (erase_xattr[i]) { - dict_unref (erase_xattr[i]); - } - } - FREE (erase_xattr); - - return 0; -} - static int -next_active_source (call_frame_t *frame, xlator_t *this, - int current_active_source) +afr_selfheal_entry_delete (call_frame_t *frame, xlator_t *this, inode_t *dir, + const char *name, inode_t *inode, int child, + struct afr_reply *replies) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int source = -1; - int next_active_source = -1; - int i = 0; + afr_private_t *priv = NULL; + xlator_t *subvol = NULL; + int ret = 0; + loc_t loc = {0, }; + char g[64]; priv = this->private; - local = frame->local; - sh = &local->self_heal; - source = sh->source; + subvol = priv->children[child]; - if (source != -1) { - if (current_active_source != source) - next_active_source = source; - goto out; - } - - /* - the next active sink becomes the source for the - 'conservative decision' of merging all entries - */ - - for (i = 0; i < priv->child_count; i++) { - if ((sh->sources[i] == 0) - && (local->child_up[i] == 1) - && (i > current_active_source)) { + loc.parent = inode_ref (dir); + uuid_copy (loc.pargfid, dir->gfid); + loc.name = name; + loc.inode = inode_ref (inode); - next_active_source = i; + if (replies[child].valid && replies[child].op_ret == 0) { + switch (replies[child].poststat.ia_type) { + case IA_IFDIR: + gf_log (this->name, GF_LOG_WARNING, + "expunging dir %s/%s (%s) on %s", + uuid_utoa (dir->gfid), name, + uuid_utoa_r (replies[child].poststat.ia_gfid, g), + subvol->name); + ret = syncop_rmdir (subvol, &loc, 1); break; - } - } -out: - return next_active_source; -} - - - -static int -next_active_sink (call_frame_t *frame, xlator_t *this, - int current_active_sink) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int next_active_sink = -1; - int i = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - /* - the next active sink becomes the source for the - 'conservative decision' of merging all entries - */ - - for (i = 0; i < priv->child_count; i++) { - if ((sh->sources[i] == 0) - && (local->child_up[i] == 1) - && (i > current_active_sink)) { - - next_active_sink = i; + default: + gf_log (this->name, GF_LOG_WARNING, + "expunging file %s/%s (%s) on %s", + uuid_utoa (dir->gfid), name, + uuid_utoa_r (replies[child].poststat.ia_gfid, g), + subvol->name); + ret = syncop_unlink (subvol, &loc); break; } } - return next_active_sink; -} - - -int -build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) -{ - int ret = -1; - - if (!child) { - goto out; - } - - if (strcmp (parent->path, "/") == 0) - asprintf ((char **)&child->path, "/%s", name); - else - asprintf ((char **)&child->path, "%s/%s", parent->path, name); - - if (!child->path) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - child->name = strrchr (child->path, '/'); - if (child->name) - child->name++; - - child->parent = inode_ref (parent->inode); - child->inode = inode_new (parent->inode->table); - - if (!child->inode) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - ret = 0; -out: - if (ret == -1) - loc_wipe (child); + loc_wipe (&loc); return ret; } int -afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this); - -int -afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, - int active_src); - -int -afr_sh_entry_expunge_entry_done (call_frame_t *frame, xlator_t *this, - int active_src) +afr_selfheal_recreate_entry (call_frame_t *frame, xlator_t *this, int dst, + int source, inode_t *dir, const char *name, + inode_t *inode, struct afr_reply *replies) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int call_count = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - LOCK (&frame->lock); - { - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_entry_expunge_subvol (frame, this, active_src); - - return 0; -} - - -int -afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - int active_src = 0; - call_frame_t *frame = NULL; - - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - - active_src = (long) cookie; - - if (op_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "removed %s on %s", - expunge_local->loc.path, - priv->children[active_src]->name); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "removing %s on %s failed (%s)", - expunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - } - - AFR_STACK_DESTROY (expunge_frame); - afr_sh_entry_expunge_entry_done (frame, this, active_src); - - return 0; -} - - -int -afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; + int ret = 0; + loc_t loc = {0,}; + loc_t srcloc = {0,}; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + struct iatt *iatt = NULL; + char *linkname = NULL; + mode_t mode = 0; + struct iatt newent = {0,}; priv = this->private; - expunge_local = expunge_frame->local; - gf_log (this->name, GF_LOG_TRACE, - "removing directory %s on %s", - expunge_local->loc.path, priv->children[active_src]->name); + xdata = dict_new(); + if (!xdata) + return -ENOMEM; - STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->rmdir, - &expunge_local->loc); + loc.parent = inode_ref (dir); + uuid_copy (loc.pargfid, dir->gfid); + loc.name = name; + loc.inode = inode_ref (inode); - return 0; -} - - -int -afr_sh_entry_expunge_unlink (call_frame_t *expunge_frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; + ret = afr_selfheal_entry_delete (frame, this, dir, name, inode, dst, + replies); + if (ret) + goto out; - priv = this->private; - expunge_local = expunge_frame->local; - - gf_log (this->name, GF_LOG_TRACE, - "unlinking file %s on %s", - expunge_local->loc.path, priv->children[active_src]->name); - - STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->unlink, - &expunge_local->loc); - - return 0; -} + ret = dict_set_static_bin (xdata, "gfid-req", + replies[source].poststat.ia_gfid, 16); + if (ret) + goto out; + iatt = &replies[source].poststat; -int -afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, - int active_src, struct stat *buf) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - int source = 0; - call_frame_t *frame = NULL; - int type = 0; + srcloc.inode = inode_ref (inode); + uuid_copy (srcloc.gfid, iatt->ia_gfid); - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - source = expunge_sh->source; - - type = (buf->st_mode & S_IFMT); - - switch (type) { - case S_IFSOCK: - case S_IFREG: - case S_IFBLK: - case S_IFCHR: - case S_IFIFO: - case S_IFLNK: - afr_sh_entry_expunge_unlink (expunge_frame, this, active_src); + mode = st_mode_from_ia (iatt->ia_prot, iatt->ia_type); + switch (iatt->ia_type) { + case IA_IFDIR: + ret = syncop_mkdir (priv->children[dst], &loc, mode, xdata, 0); break; - case S_IFDIR: - afr_sh_entry_expunge_rmdir (expunge_frame, this, active_src); + case IA_IFLNK: + ret = syncop_lookup (priv->children[dst], &srcloc, 0, 0, 0, 0); + if (ret == 0) { + ret = syncop_link (priv->children[dst], &srcloc, &loc); + } else { + ret = syncop_readlink (priv->children[source], &srcloc, + &linkname, 4096); + if (ret <= 0) + goto out; + ret = syncop_symlink (priv->children[dst], &loc, linkname, + xdata, NULL); + } break; default: - gf_log (this->name, GF_LOG_ERROR, - "%s has unknown file type on %s: 0%o", - expunge_local->loc.path, - priv->children[source]->name, type); - goto out; + ret = dict_set_int32 (xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1); + if (ret) + goto out; + ret = syncop_mknod (priv->children[dst], &loc, mode, + iatt->ia_rdev, xdata, &newent); + if (ret == 0 && iatt->ia_size && !newent.ia_size) { + /* New entry created. Mark @dst pending on all sources */ + ret = 1; + } break; } - return 0; out: - AFR_STACK_DESTROY (expunge_frame); - afr_sh_entry_expunge_entry_done (frame, this, active_src); - - return 0; + if (xdata) + dict_unref (xdata); + loc_wipe (&loc); + loc_wipe (&srcloc); + return ret; } -int -afr_sh_entry_expunge_lookup_cbk (call_frame_t *expunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct stat *buf, dict_t *x) +static int +afr_selfheal_newentry_mark (call_frame_t *frame, xlator_t *this, inode_t *inode, + int source, struct afr_reply *replies, + unsigned char *sources, unsigned char *newentry) { - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - call_frame_t *frame = NULL; - int active_src = 0; + int ret = 0; + int i = 0; + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + int **changelog = NULL; + int idx = 0; priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - active_src = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_TRACE, - "lookup of %s on %s failed (%s)", - expunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - goto out; - } - afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf); + idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); - return 0; -out: - AFR_STACK_DESTROY (expunge_frame); - afr_sh_entry_expunge_entry_done (frame, this, active_src); - - return 0; -} + uuid_copy (inode->gfid, replies[source].poststat.ia_gfid); + changelog = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS); -int -afr_sh_entry_expunge_purge (call_frame_t *expunge_frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on %s", - expunge_local->loc.path, priv->children[active_src]->name); - - STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_lookup_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->lookup, - &expunge_local->loc, 0); - - return 0; -} - - -int -afr_sh_entry_expunge_entry_cbk (call_frame_t *expunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct stat *buf, dict_t *x) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - int source = 0; - call_frame_t *frame = NULL; - int active_src = 0; - + xattr = dict_new(); + if (!xattr) + return -ENOMEM; - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - active_src = expunge_sh->active_source; - source = (long) cookie; - - if (op_ret == -1 && op_errno == ENOENT) { - - gf_log (this->name, GF_LOG_TRACE, - "missing entry %s on %s", - expunge_local->loc.path, - priv->children[source]->name); - - afr_sh_entry_expunge_purge (expunge_frame, this, active_src); - - return 0; + for (i = 0; i < priv->child_count; i++) { + if (!newentry[i]) + continue; + changelog[i][idx] = hton32(1); } - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "%s exists under %s", - expunge_local->loc.path, - priv->children[source]->name); - } else { - gf_log (this->name, GF_LOG_TRACE, - "looking up %s under %s failed (%s)", - expunge_local->loc.path, - priv->children[source]->name, - strerror (op_errno)); - } + afr_set_pending_dict (priv, xattr, changelog); - AFR_STACK_DESTROY (expunge_frame); - afr_sh_entry_expunge_entry_done (frame, this, active_src); + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + afr_selfheal_post_op (frame, this, inode, i, xattr); + } - return 0; + dict_unref (xattr); + return ret; } -int -afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, - char *name) +static int +__afr_selfheal_heal_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, + char *name, inode_t *inode, int source, + unsigned char *sources, unsigned char *healed_sinks, + unsigned char *locked_on, struct afr_reply *replies) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int ret = -1; - call_frame_t *expunge_frame = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - int active_src = 0; - int source = 0; - int op_errno = 0; + int ret = 0; + afr_private_t *priv = NULL; + int i = 0; + unsigned char *newentry = NULL; priv = this->private; - local = frame->local; - sh = &local->self_heal; - - active_src = sh->active_source; - source = sh->source; + newentry = alloca0 (priv->child_count); - if ((strcmp (name, ".") == 0) - || (strcmp (name, "..") == 0)) { - gf_log (this->name, GF_LOG_TRACE, - "skipping inspection of %s under %s", - name, local->loc.path); - goto out; - } + if (!replies[source].valid) + return -EIO; - gf_log (this->name, GF_LOG_TRACE, - "inspecting existance of %s under %s", - name, local->loc.path); - - expunge_frame = copy_frame (frame); - if (!expunge_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - ALLOC_OR_GOTO (expunge_local, afr_local_t, out); - - expunge_frame->local = expunge_local; - expunge_sh = &expunge_local->self_heal; - expunge_sh->sh_frame = frame; - expunge_sh->active_source = active_src; - - ret = build_child_loc (this, &expunge_local->loc, &local->loc, name); - if (ret != 0) { - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on %s", expunge_local->loc.path, - priv->children[source]->name); - - STACK_WIND_COOKIE (expunge_frame, - afr_sh_entry_expunge_entry_cbk, - (void *) (long) source, - priv->children[source], - priv->children[source]->fops->lookup, - &expunge_local->loc, 0); - - ret = 0; -out: - if (ret == -1) - afr_sh_entry_expunge_entry_done (frame, this, active_src); - - return 0; -} - - -int -afr_sh_entry_expunge_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - gf_dirent_t *entry = NULL; - off_t last_offset = 0; - int active_src = 0; - int entry_count = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - active_src = sh->active_source; - - if (op_ret <= 0) { - if (op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "readdir of %s on subvolume %s failed (%s)", - local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i]) + continue; + if (replies[source].op_ret == -1 && + replies[source].op_errno == ENOENT) { + ret = afr_selfheal_entry_delete (frame, this, fd->inode, + name, inode, i, replies); } else { - gf_log (this->name, GF_LOG_TRACE, - "readdir of %s on subvolume %s complete", - local->loc.path, - priv->children[active_src]->name); + if (!uuid_compare (replies[i].poststat.ia_gfid, + replies[source].poststat.ia_gfid)) + continue; + + ret = afr_selfheal_recreate_entry (frame, this, i, source, + fd->inode, name, inode, + replies); + if (ret > 0) { + newentry[i] = 1; + ret = 0; + } } - - afr_sh_entry_expunge_all (frame, this); - return 0; - } - - list_for_each_entry (entry, &entries->list, list) { - last_offset = entry->d_off; - entry_count++; - } - - gf_log (this->name, GF_LOG_TRACE, - "readdir'ed %d entries from %s", - entry_count, priv->children[active_src]->name); - - sh->offset = last_offset; - local->call_count = entry_count; - - list_for_each_entry (entry, &entries->list, list) { - afr_sh_entry_expunge_entry (frame, this, entry->d_name); - } - - return 0; -} - -int -afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk, - priv->children[active_src], - priv->children[active_src]->fops->readdir, - sh->healing_fd, sh->block_size, sh->offset); - - return 0; -} - - -int -afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int active_src = -1; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sh->offset = 0; - - if (sh->source == -1) { - gf_log (this->name, GF_LOG_TRACE, - "no active sources for %s to expunge entries", - local->loc.path); - goto out; - } - - active_src = next_active_sink (frame, this, sh->active_source); - sh->active_source = active_src; - - if (sh->op_failed) { - goto out; - } - - if (active_src == -1) { - /* completed creating missing files on all subvolumes */ - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "expunging entries of %s on %s to other sinks", - local->loc.path, priv->children[active_src]->name); - - afr_sh_entry_expunge_subvol (frame, this, active_src); - - return 0; -out: - afr_sh_entry_erase_pending (frame, this); - return 0; - -} - - -int -afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this); - -int -afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this, - int active_src); - -int -afr_sh_entry_impunge_entry_done (call_frame_t *frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int call_count = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - LOCK (&frame->lock); - { + if (ret < 0) + break; } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - if (call_count == 0) - afr_sh_entry_impunge_subvol (frame, this, active_src); - - return 0; + if (AFR_COUNT (newentry, priv->child_count)) + afr_selfheal_newentry_mark (frame, this, inode, source, replies, + sources, newentry); + return ret; } -int -afr_sh_entry_impunge_utimens_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, struct stat *stbuf) +static int +__afr_selfheal_merge_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, + char *name, inode_t *inode, unsigned char *sources, + unsigned char *healed_sinks, unsigned char *locked_on, + struct afr_reply *replies) { - int call_count = 0; - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *impunge_sh = NULL; - call_frame_t *frame = NULL; - int active_src = 0; - int child_index = 0; + int ret = 0; + afr_private_t *priv = NULL; + int i = 0; + int source = -1; priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - active_src = sh->active_source; - child_index = (long) cookie; - - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "utimes set for %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "setting utimes of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - } - - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) { - AFR_STACK_DESTROY (impunge_frame); - afr_sh_entry_impunge_entry_done (frame, this, active_src); - } - - return 0; -} - - -int -afr_sh_entry_impunge_chown_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, struct stat *stbuf) -{ - int call_count = 0; - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - call_frame_t *frame = NULL; - int active_src = 0; - int child_index = 0; - struct timespec ts[2]; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - child_index = (long) cookie; - - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "ownership of %s on %s changed", - impunge_local->loc.path, - priv->children[child_index]->name); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "setting ownership of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - goto out; - } - -#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC - ts[0] = impunge_local->cont.lookup.buf.st_atim; - ts[1] = impunge_local->cont.lookup.buf.st_mtim; -#elif HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC - ts[0] = impunge_local->cont.lookup.buf.st_atimespec; - ts[1] = impunge_local->cont.lookup.buf.st_mtimespec; -#else - ts[0].tv_sec = impunge_local->cont.lookup.buf.st_atime; - ts[1].tv_sec = impunge_local->cont.lookup.buf.st_mtime; -#endif - STACK_WIND_COOKIE (impunge_frame, - afr_sh_entry_impunge_utimens_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->utimens, - &impunge_local->loc, ts); - - return 0; - -out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) { - AFR_STACK_DESTROY (impunge_frame); - afr_sh_entry_impunge_entry_done (frame, this, active_src); + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid && replies[i].op_ret == 0) { + source = i; + break; + } } - return 0; -} - - -int -afr_sh_entry_impunge_xattrop_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - dict_t *xattr) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - call_frame_t *frame = NULL; - int child_index = 0; - - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - - child_index = (long) cookie; - - gf_log (this->name, GF_LOG_TRACE, - "setting ownership of %s on %s to %d/%d", - impunge_local->loc.path, - priv->children[child_index]->name, - impunge_local->cont.lookup.buf.st_uid, - impunge_local->cont.lookup.buf.st_gid); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_chown_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->chown, - &impunge_local->loc, - impunge_local->cont.lookup.buf.st_uid, - impunge_local->cont.lookup.buf.st_gid); - return 0; -} - - -int -afr_sh_entry_impunge_newfile_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct stat *stbuf) -{ - int call_count = 0; - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - call_frame_t *frame = NULL; - int active_src = 0; - int child_index = 0; - int pending_array[3] = {0, }; - dict_t *xattr = NULL; - int ret = 0; - int idx = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - active_src = sh->active_source; - - child_index = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "creation of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - goto out; + if (source == -1) { + /* entry got deleted in the mean time? */ + return 0; } - inode->st_mode = stbuf->st_mode; - - xattr = get_new_dict (); - dict_ref (xattr); - - idx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); - pending_array[idx] = hton32 (1); - if (S_ISDIR (stbuf->st_mode)) - idx = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION); - else - idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); - pending_array[idx] = hton32 (1); - - ret = dict_set_static_bin (xattr, priv->pending_key[child_index], - pending_array, sizeof (pending_array)); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_xattrop_cbk, - (void *) (long) child_index, - priv->children[active_src], - priv->children[active_src]->fops->xattrop, - &impunge_local->loc, GF_XATTROP_ADD_ARRAY, xattr); - - dict_unref (xattr); - - return 0; + for (i = 0; i < priv->child_count; i++) { + if (i == source || !healed_sinks[i]) + continue; -out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); + if (replies[i].op_errno != ENOENT) + continue; - if (call_count == 0) { - AFR_STACK_DESTROY (impunge_frame); - afr_sh_entry_impunge_entry_done (frame, this, active_src); + ret = afr_selfheal_recreate_entry (frame, this, i, source, + fd->inode, name, inode, + replies); } - return 0; -} - - -int -afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct stat *stbuf) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - - gf_log (this->name, GF_LOG_DEBUG, - "creating missing file %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->mknod, - &impunge_local->loc, - stbuf->st_mode, stbuf->st_rdev); - - return 0; + return ret; } - -int -afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct stat *stbuf) +static int +__afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, + char *name, inode_t *inode, int source, + unsigned char *sources, unsigned char *healed_sinks, + unsigned char *locked_on, struct afr_reply *replies) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - + int ret = -1; - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - - gf_log (this->name, GF_LOG_DEBUG, - "creating missing directory %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->mkdir, - &impunge_local->loc, stbuf->st_mode); - - return 0; + if (source < 0) + ret = __afr_selfheal_merge_dirent (frame, this, fd, name, inode, + sources, healed_sinks, + locked_on, replies); + else + ret = __afr_selfheal_heal_dirent (frame, this, fd, name, inode, + source, sources, healed_sinks, + locked_on, replies); + return ret; } -int -afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this, - int child_index, const char *linkname) +static int +afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *sources, + unsigned char *healed_sinks, char *name) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - + afr_private_t *priv = NULL; + int ret = 0; + unsigned char *locked_on = NULL; + struct afr_reply *replies = NULL; + inode_t *inode = NULL; priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - gf_log (this->name, GF_LOG_DEBUG, - "creating missing symlink %s -> %s on %s", - impunge_local->loc.path, linkname, - priv->children[child_index]->name); + locked_on = alloca0 (priv->child_count); - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->symlink, - linkname, &impunge_local->loc); + replies = alloca0 (priv->child_count * sizeof(*replies)); - return 0; -} - - -int -afr_sh_entry_impunge_readlink_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - const char *linkname) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int child_index = -1; - call_frame_t *frame = NULL; - int call_count = -1; - int active_src = -1; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - active_src = impunge_sh->active_source; - - child_index = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "readlink of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - goto out; - } - - afr_sh_entry_impunge_symlink (impunge_frame, this, child_index, - linkname); - return 0; - -out: - LOCK (&impunge_frame->lock); + ret = afr_selfheal_entrylk (frame, this, fd->inode, this->name, + name, locked_on); { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) { - AFR_STACK_DESTROY (impunge_frame); - afr_sh_entry_impunge_entry_done (frame, this, active_src); - } - - return 0; -} - - -int -afr_sh_entry_impunge_readlink (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct stat *stbuf) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int active_src = -1; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - active_src = impunge_sh->active_source; - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk, - (void *) (long) child_index, - priv->children[active_src], - priv->children[active_src]->fops->readlink, - &impunge_local->loc, 4096); - - return 0; -} - - -int -afr_sh_entry_impunge_recreate_lookup_cbk (call_frame_t *impunge_frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct stat *buf, - dict_t *xattr) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int active_src = 0; - int type = 0; - int child_index = 0; - call_frame_t *frame = NULL; - int call_count = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - - child_index = (long) cookie; - - active_src = impunge_sh->active_source; - - if (op_ret != 0) { - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on %s (for %s) failed (%s)", - impunge_local->loc.path, - priv->children[active_src]->name, - priv->children[child_index]->name, - strerror (op_errno)); - goto out; - } - - impunge_local->cont.lookup.buf = *buf; - type = (buf->st_mode & S_IFMT); - - switch (type) { - case S_IFSOCK: - case S_IFREG: - case S_IFBLK: - case S_IFCHR: - case S_IFIFO: - afr_sh_entry_impunge_mknod (impunge_frame, this, - child_index, buf); - break; - case S_IFLNK: - afr_sh_entry_impunge_readlink (impunge_frame, this, - child_index, buf); - break; - case S_IFDIR: - afr_sh_entry_impunge_mkdir (impunge_frame, this, - child_index, buf); - break; - default: - gf_log (this->name, GF_LOG_ERROR, - "%s has unknown file type on %s: 0%o", - impunge_local->loc.path, - priv->children[active_src]->name, type); - goto out; - break; - } - - return 0; + if (ret < 2) { + ret = -ENOTCONN; + goto unlock; + } -out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); + inode = afr_selfheal_unlocked_lookup_on (frame, fd->inode, name, + replies, locked_on); + if (!inode) { + ret = -ENOMEM; + goto unlock; + } - if (call_count == 0) { - AFR_STACK_DESTROY (impunge_frame); - afr_sh_entry_impunge_entry_done (frame, this, active_src); + ret = __afr_selfheal_entry_dirent (frame, this, fd, name, inode, + source, sources, healed_sinks, + locked_on, replies); } - - return 0; +unlock: + afr_selfheal_unentrylk (frame, this, fd->inode, this->name, name, + locked_on); + if (inode) + inode_unref (inode); + return ret; } -int -afr_sh_entry_impunge_recreate (call_frame_t *impunge_frame, xlator_t *this, - int child_index) +static int +afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this, fd_t *fd, + int child, int source, unsigned char *sources, + unsigned char *healed_sinks) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int active_src = 0; - + int ret = 0; + gf_dirent_t entries; + gf_dirent_t *entry = NULL; + off_t offset = 0; + call_frame_t *iter_frame = NULL; + xlator_t *subvol = NULL; + afr_private_t *priv = NULL; priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; + subvol = priv->children[child]; - active_src = impunge_sh->active_source; + INIT_LIST_HEAD (&entries.list); - STACK_WIND_COOKIE (impunge_frame, - afr_sh_entry_impunge_recreate_lookup_cbk, - (void *) (long) child_index, - priv->children[active_src], - priv->children[active_src]->fops->lookup, - &impunge_local->loc, 0); + iter_frame = afr_copy_frame (frame); + if (!iter_frame) + return -ENOMEM; - return 0; -} + while ((ret = syncop_readdir (subvol, fd, 131072, offset, &entries))) { + if (ret > 0) + ret = 0; + list_for_each_entry (entry, &entries.list, list) { + offset = entry->d_off; + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; -int -afr_sh_entry_impunge_entry_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct stat *buf, dict_t *x) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int call_count = 0; - int child_index = 0; - call_frame_t *frame = NULL; - int active_src = 0; + if (__is_root_gfid (fd->inode->gfid) && + !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) + continue; - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - child_index = (long) cookie; - active_src = impunge_sh->active_source; - - if (op_ret == -1 && op_errno == ENOENT) { - /* decrease call_count in recreate-callback */ - gf_log (this->name, GF_LOG_TRACE, - "missing entry %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - - afr_sh_entry_impunge_recreate (impunge_frame, this, - child_index); - return 0; - } - - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "%s exists under %s", - impunge_local->loc.path, - priv->children[child_index]->name); - } else { - gf_log (this->name, GF_LOG_TRACE, - "looking up %s under %s failed (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - } + ret = afr_selfheal_entry_dirent (iter_frame, this, fd, + source, sources, + healed_sinks, + entry->d_name); + AFR_STACK_RESET (iter_frame); - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); + if (ret) + break; + } - if (call_count == 0) { - AFR_STACK_DESTROY (impunge_frame); - afr_sh_entry_impunge_entry_done (frame, this, active_src); + gf_dirent_free (&entries); + if (ret) + break; } - return 0; + AFR_STACK_DESTROY (iter_frame); + return ret; } - -int -afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this, - char *name) +static int +afr_selfheal_entry_do (call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *sources, + unsigned char *healed_sinks, + struct afr_reply *locked_replies) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int ret = -1; - call_frame_t *impunge_frame = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int active_src = 0; - int i = 0; - int call_count = 0; - int op_errno = 0; + int i = 0; + afr_private_t *priv = NULL; + int ret = 0; priv = this->private; - local = frame->local; - sh = &local->self_heal; - - active_src = sh->active_source; - if ((strcmp (name, ".") == 0) - || (strcmp (name, "..") == 0)) { - gf_log (this->name, GF_LOG_TRACE, - "skipping inspection of %s under %s", - name, local->loc.path); - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "inspecting existance of %s under %s", - name, local->loc.path); - - impunge_frame = copy_frame (frame); - if (!impunge_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - ALLOC_OR_GOTO (impunge_local, afr_local_t, out); - - impunge_frame->local = impunge_local; - impunge_sh = &impunge_local->self_heal; - impunge_sh->sh_frame = frame; - impunge_sh->active_source = active_src; - - ret = build_child_loc (this, &impunge_local->loc, &local->loc, name); - if (ret != 0) { - goto out; - } - - for (i = 0; i < priv->child_count; i++) { - if (i == active_src) - continue; - if (local->child_up[i] == 0) - continue; - if (sh->sources[i] == 1) - continue; - call_count++; - } - - impunge_local->call_count = call_count; + gf_log (this->name, GF_LOG_INFO, "performing entry selfheal on %s", + uuid_utoa (fd->inode->gfid)); for (i = 0; i < priv->child_count; i++) { - if (i == active_src) - continue; - if (local->child_up[i] == 0) - continue; - if (sh->sources[i] == 1) + if (i != source && !healed_sinks[i]) continue; - - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on %s", impunge_local->loc.path, - priv->children[i]->name); - - STACK_WIND_COOKIE (impunge_frame, - afr_sh_entry_impunge_entry_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - &impunge_local->loc, 0); - - if (!--call_count) + ret = afr_selfheal_entry_do_subvol (frame, this, fd, i, source, + sources, healed_sinks); + if (ret) break; } - - ret = 0; -out: - if (ret == -1) - afr_sh_entry_impunge_entry_done (frame, this, active_src); - - return 0; -} - - -int -afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - gf_dirent_t *entry = NULL; - off_t last_offset = 0; - int active_src = 0; - int entry_count = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - active_src = sh->active_source; - - if (op_ret <= 0) { - if (op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "readdir of %s on subvolume %s failed (%s)", - local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_TRACE, - "readdir of %s on subvolume %s complete", - local->loc.path, - priv->children[active_src]->name); - } - - afr_sh_entry_impunge_all (frame, this); - return 0; - } - - list_for_each_entry (entry, &entries->list, list) { - last_offset = entry->d_off; - entry_count++; - } - - gf_log (this->name, GF_LOG_TRACE, - "readdir'ed %d entries from %s", - entry_count, priv->children[active_src]->name); - - sh->offset = last_offset; - local->call_count = entry_count; - - list_for_each_entry (entry, &entries->list, list) { - afr_sh_entry_impunge_entry (frame, this, entry->d_name); - } - - return 0; -} - - -int -afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - STACK_WIND (frame, afr_sh_entry_impunge_readdir_cbk, - priv->children[active_src], - priv->children[active_src]->fops->readdir, - sh->healing_fd, sh->block_size, sh->offset); - - return 0; -} - - -int -afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int active_src = -1; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sh->offset = 0; - - active_src = next_active_source (frame, this, sh->active_source); - sh->active_source = active_src; - - if (sh->op_failed) { - afr_sh_entry_finish (frame, this); - return 0; - } - - if (active_src == -1) { - /* completed creating missing files on all subvolumes */ - afr_sh_entry_expunge_all (frame, this); - return 0; - } - - gf_log (this->name, GF_LOG_TRACE, - "impunging entries of %s on %s to other sinks", - local->loc.path, priv->children[active_src]->name); - - afr_sh_entry_impunge_subvol (frame, this, active_src); - - return 0; -} - - -int -afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - child_index = (long) cookie; - - /* TODO: some of the open's might fail. - In that case, modify cleanup fn to send flush on those - fd's which are already open */ - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "opendir of %s failed on child %s (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - sh->op_failed = 1; - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (sh->op_failed) { - afr_sh_entry_finish (frame, this); - return 0; - } - gf_log (this->name, GF_LOG_TRACE, - "fd for %s opened, commencing sync", - local->loc.path); - - sh->active_source = -1; - afr_sh_entry_impunge_all (frame, this); - } - - return 0; + return ret; } -int -afr_sh_entry_open (call_frame_t *frame, xlator_t *this) +static int +__afr_selfheal_entry_finalize_source (xlator_t *this, unsigned char *sources, + unsigned char *sinks, + unsigned char *locked_on, + struct afr_reply *replies) { - int i = 0; - int call_count = 0; - + int i = 0; + afr_private_t *priv = NULL; int source = -1; - int *sources = NULL; + int locked_count = 0; + int sources_count = 0; + int sinks_count = 0; - fd_t *fd = NULL; - - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; priv = this->private; - source = local->self_heal.source; - sources = local->self_heal.sources; - - sh->block_size = 131072; - sh->offset = 0; - - call_count = sh->active_sinks; - if (source != -1) - call_count++; + locked_count = AFR_COUNT (locked_on, priv->child_count); + sources_count = AFR_COUNT (sources, priv->child_count); + sinks_count = AFR_COUNT (sinks, priv->child_count); - local->call_count = call_count; - - fd = fd_create (local->loc.inode, frame->root->pid); - sh->healing_fd = fd; - - if (source != -1) { - gf_log (this->name, GF_LOG_TRACE, - "opening directory %s on subvolume %s (source)", - local->loc.path, priv->children[source]->name); - - /* open source */ - STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk, - (void *) (long) source, - priv->children[source], - priv->children[source]->fops->opendir, - &local->loc, fd); - call_count--; + if (locked_count == sinks_count || !sources_count) { + return -1; } - /* open sinks */ for (i = 0; i < priv->child_count; i++) { - if (sources[i] || !local->child_up[i]) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "opening directory %s on subvolume %s (sink)", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->opendir, - &local->loc, fd); - - if (!--call_count) + if (sources[i]) { + source = i; break; - } - - return 0; -} - - -int -afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int active_sinks = 0; - int source = 0; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] == 0 && local->child_up[i] == 1) { - active_sinks++; - sh->success[i] = 1; } } - if (source != -1) - sh->success[source] = 1; - - if (active_sinks == 0) { - gf_log (this->name, GF_LOG_TRACE, - "no active sinks for self-heal on dir %s", - local->loc.path); - afr_sh_entry_finish (frame, this); - return 0; - } - if (source == -1 && active_sinks < 2) { - gf_log (this->name, GF_LOG_TRACE, - "cannot sync with 0 sources and 1 sink on dir %s", - local->loc.path); - afr_sh_entry_finish (frame, this); - return 0; - } - sh->active_sinks = active_sinks; - - if (source != -1) - gf_log (this->name, GF_LOG_DEBUG, - "self-healing directory %s from subvolume %s to " - "%d other", - local->loc.path, priv->children[source]->name, - active_sinks); - else - gf_log (this->name, GF_LOG_DEBUG, - "no active sources for %s found. " - "merging all entries as a conservative decision", - local->loc.path); - - afr_sh_entry_open (frame, this); - return 0; + return source; } -int -afr_sh_entry_fix (call_frame_t *frame, xlator_t *this) +static int +__afr_selfheal_entry_prepare (call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *locked_on, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, + struct afr_reply *replies, int *source_p) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + int i = 0; - local = frame->local; - sh = &local->self_heal; priv = this->private; - afr_sh_build_pending_matrix (priv, sh->pending_matrix, sh->xattr, - priv->child_count, AFR_ENTRY_TRANSACTION); - - afr_sh_print_pending_matrix (sh->pending_matrix, this); + ret = afr_selfheal_unlocked_discover (frame, fd->inode, fd->inode->gfid, + replies); + if (ret) + return ret; - afr_sh_mark_sources (sh, priv->child_count, - AFR_SELF_HEAL_ENTRY); + ret = afr_selfheal_find_direction (frame, this, replies, + AFR_ENTRY_TRANSACTION, + locked_on, sources, sinks); + if (ret) + return ret; - afr_sh_supress_errenous_children (sh->sources, sh->child_errno, - priv->child_count); + source = __afr_selfheal_entry_finalize_source (this, sources, sinks, + locked_on, replies); + if (source < 0) { + /* If source is < 0 (typically split-brain), we perform a + conservative merge of entries rather than erroring out */ + } + *source_p = source; - source = afr_sh_select_source (sh->sources, priv->child_count); - sh->source = source; + for (i = 0; i < priv->child_count; i++) + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). - afr_sh_entry_sync_prepare (frame, this); + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. + */ + healed_sinks[i] = sinks[i] && locked_on[i]; - return 0; + return ret; } +static int +__afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *locked_on) +{ + afr_private_t *priv = NULL; + int ret = -1; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *data_lock = NULL; + unsigned char *healed_sinks = NULL; + struct afr_reply *locked_replies = NULL; + int source = -1; -int -afr_sh_entry_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - inode_t *inode, struct stat *buf, dict_t *xattr) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + priv = this->private; - int call_count = -1; - int child_index = (long) cookie; + sources = alloca0 (priv->child_count); + sinks = alloca0 (priv->child_count); + healed_sinks = alloca0 (priv->child_count); + data_lock = alloca0 (priv->child_count); - local = frame->local; - sh = &local->self_heal; - priv = this->private; + locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count); - LOCK (&frame->lock); + ret = afr_selfheal_entrylk (frame, this, fd->inode, this->name, NULL, + data_lock); { - if (op_ret != -1) { - sh->xattr[child_index] = dict_ref (xattr); - sh->buf[child_index] = *buf; + if (ret < 2) { + ret = -ENOTCONN; + goto unlock; } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - if (call_count == 0) { - afr_sh_entry_fix (frame, this); + ret = __afr_selfheal_entry_prepare (frame, this, fd, data_lock, + sources, sinks, healed_sinks, + locked_replies, &source); } +unlock: + afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL, + data_lock); + if (ret < 0) + goto out; - return 0; -} - - - -int -afr_sh_entry_lookup (call_frame_t *frame, xlator_t *this) -{ - afr_self_heal_t * sh = NULL; - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - dict_t *xattr_req = NULL; - int ret = 0; - int call_count = 0; - int i = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - call_count = local->child_count; - - local->call_count = call_count; - - xattr_req = dict_new(); - if (xattr_req) { - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_uint64 (xattr_req, - priv->pending_key[i], - 3 * sizeof(int32_t)); - } - } - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, - afr_sh_entry_lookup_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - &local->loc, xattr_req); - if (!--call_count) - break; - } - } - - if (xattr_req) - dict_unref (xattr_req); + ret = afr_selfheal_entry_do (frame, this, fd, source, sources, + healed_sinks, locked_replies); + if (ret) + goto out; - return 0; + ret = afr_selfheal_undo_pending (frame, this, fd->inode, sources, sinks, + healed_sinks, AFR_ENTRY_TRANSACTION, + locked_replies, data_lock); +out: + return ret; } - -int -afr_sh_entry_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +static fd_t * +afr_selfheal_data_opendir (xlator_t *this, inode_t *inode) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int call_count = 0; - int child_index = (long) cookie; - - /* TODO: what if lock fails? */ - - local = frame->local; - sh = &local->self_heal; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - sh->op_failed = 1; - - gf_log (this->name, GF_LOG_DEBUG, - "locking inode of %s on child %d failed: %s", - local->loc.path, child_index, - strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_TRACE, - "inode of %s on child %d locked", - local->loc.path, child_index); - } - } - UNLOCK (&frame->lock); + loc_t loc = {0,}; + int ret = 0; + fd_t *fd = NULL; - call_count = afr_frame_return (frame); + fd = fd_create (inode, 0); + if (!fd) + return NULL; - if (call_count == 0) { - if (sh->op_failed == 1) { - afr_sh_entry_finish (frame, this); - return 0; - } + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - afr_sh_entry_lookup (frame, this); + ret = syncop_opendir (this, &loc, fd); + if (ret) { + fd_unref (fd); + fd = NULL; + } else { + fd_bind (fd); } - return 0; + loc_wipe (&loc); + return fd; } int -afr_sh_entry_lock (call_frame_t *frame, xlator_t *this) +afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t *inode) { - int i = 0; - int call_count = 0; - - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - afr_self_heal_t * sh = NULL; - + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + fd_t *fd = NULL; + int ret = 0; - local = frame->local; - sh = &local->self_heal; priv = this->private; - call_count = local->child_count; + fd = afr_selfheal_data_opendir (this, inode); + if (!fd) + return -EIO; - local->call_count = call_count; + locked_on = alloca0 (priv->child_count); - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - gf_log (this->name, GF_LOG_TRACE, - "locking %s on subvolume %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_entry_lock_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, - &local->loc, NULL, - ENTRYLK_LOCK_NB, ENTRYLK_WRLCK); - if (!--call_count) - break; + ret = afr_selfheal_tryentrylk (frame, this, inode, priv->sh_domain, NULL, + locked_on); + { + if (ret < 2) { + /* Either less than two subvols available, or another + selfheal (from another server) is in progress. Skip + for now in any case there isn't anything to do. + */ + ret = -ENOTCONN; + goto unlock; } - } - - return 0; -} - -int -afr_self_heal_entry (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - if (local->need_entry_self_heal && priv->entry_self_heal) { - afr_sh_entry_lock (frame, this); - } else { - gf_log (this->name, GF_LOG_TRACE, - "proceeding to completion on %s", - local->loc.path); - afr_sh_entry_done (frame, this); + ret = __afr_selfheal_entry (frame, this, fd, locked_on); } +unlock: + afr_selfheal_unentrylk (frame, this, inode, priv->sh_domain, NULL, locked_on); - return 0; -} + if (fd) + fd_unref (fd); + return ret; +} diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index 966b754b3..83628297f 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -1,806 +1,280 @@ /* - Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ -#include <libgen.h> -#include <unistd.h> -#include <fnmatch.h> -#include <sys/time.h> -#include <stdlib.h> -#include <signal.h> #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" #endif -#include "glusterfs.h" #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" - -#include "afr-transaction.h" #include "afr-self-heal.h" -#include "afr-self-heal-common.h" - - -int -afr_sh_metadata_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - -// memset (sh->child_errno, 0, sizeof (int) * priv->child_count); - memset (sh->buf, 0, sizeof (struct stat) * priv->child_count); - memset (sh->success, 0, sizeof (int) * priv->child_count); - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) - dict_unref (sh->xattr[i]); - sh->xattr[i] = NULL; - } - - if (local->govinda_gOvinda) { - gf_log (this->name, GF_LOG_DEBUG, - "aborting selfheal of %s", - local->loc.path); - sh->completion_cbk (frame, this); - } else { - if (S_ISREG (local->cont.lookup.buf.st_mode)) { - gf_log (this->name, GF_LOG_TRACE, - "proceeding to data check on %s", - local->loc.path); - afr_self_heal_data (frame, this); - return 0; - } - - if (S_ISDIR (local->cont.lookup.buf.st_mode)) { - gf_log (this->name, GF_LOG_TRACE, - "proceeding to entry check on %s", - local->loc.path); - afr_self_heal_entry (frame, this); - return 0; - } - gf_log (this->name, GF_LOG_DEBUG, - "completed self heal of %s", - local->loc.path); - - sh->completion_cbk (frame, this); - } - - return 0; -} - - -int -afr_sh_metadata_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - int call_count = 0; - - - local = frame->local; - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_metadata_done (frame, this); - - return 0; -} - - -int -afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - int call_count = 0; - struct flock flock = {0, }; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - call_count = local->child_count; - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - flock.l_start = 0; - flock.l_len = 0; - flock.l_type = F_UNLCK; - - if (local->child_up[i]) { - gf_log (this->name, GF_LOG_TRACE, - "unlocking %s on subvolume %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND (frame, afr_sh_metadata_unlck_cbk, - priv->children[i], - priv->children[i]->fops->inodelk, - this->name, - &local->loc, F_SETLK, &flock); - - if (!--call_count) - break; - } - } - - return 0; -} - - -int -afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - LOCK (&frame->lock); - { - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_metadata_finish (frame, this); - - return 0; -} +#include "byte-order.h" +#define AFR_HEAL_ATTR (GF_SET_ATTR_UID|GF_SET_ATTR_GID|GF_SET_ATTR_MODE) int -afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this) +afr_selfheal_metadata_do (call_frame_t *frame, xlator_t *this, inode_t *inode, + int source, unsigned char *healed_sinks, + struct afr_reply *locked_replies) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; - dict_t **erase_xattr = NULL; - + int ret = -1; + loc_t loc = {0,}; + dict_t *xattr = NULL; + dict_t *old_xattr = NULL; + afr_private_t *priv = NULL; + int i = 0; - local = frame->local; - sh = &local->self_heal; priv = this->private; - afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, - sh->success, priv->child_count, - AFR_METADATA_TRANSACTION); + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count); - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - call_count++; + gf_log (this->name, GF_LOG_INFO, "performing metadata selfheal on %s", + uuid_utoa (inode->gfid)); - erase_xattr[i] = get_new_dict(); - dict_ref (erase_xattr[i]); - } + ret = syncop_getxattr (priv->children[source], &loc, &xattr, NULL); + if (ret < 0) { + loc_wipe (&loc); + return -EIO; } - afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr, - priv->child_count, AFR_METADATA_TRANSACTION); - - local->call_count = call_count; - - if (call_count == 0) { - gf_log (this->name, GF_LOG_WARNING, - "metadata of %s not healed on any subvolume", - local->loc.path); - - afr_sh_metadata_finish (frame, this); - } + afr_filter_xattrs (xattr); + dict_del (xattr, GF_SELINUX_XATTR_KEY); for (i = 0; i < priv->child_count; i++) { - if (!erase_xattr[i]) + if (!healed_sinks[i]) continue; - gf_log (this->name, GF_LOG_TRACE, - "erasing pending flags from %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_metadata_erase_pending_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, erase_xattr[i]); - if (!--call_count) - break; - } - - for (i = 0; i < priv->child_count; i++) { - if (erase_xattr[i]) { - dict_unref (erase_xattr[i]); + ret = syncop_setattr (priv->children[i], &loc, + &locked_replies[source].poststat, + AFR_HEAL_ATTR, NULL, NULL); + if (ret) + healed_sinks[i] = 0; + + old_xattr = NULL; + ret = syncop_getxattr (priv->children[i], &loc, &old_xattr, 0); + if (old_xattr) { + dict_del (old_xattr, GF_SELINUX_XATTR_KEY); + afr_filter_xattrs (old_xattr); + ret = syncop_removexattr (priv->children[i], &loc, "", + old_xattr); } - } - FREE (erase_xattr); - - return 0; -} - - -int -afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "setting attributes failed for %s on %s (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - - sh->success[child_index] = 0; - } + ret = syncop_setxattr (priv->children[i], &loc, xattr, 0); + if (ret) + healed_sinks[i] = 0; } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_metadata_erase_pending (frame, this); - - return 0; -} - - -int -afr_sh_metadata_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) -{ - afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno); - - return 0; -} - - -int -afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno); - - return 0; -} - - -int -afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; - int active_sinks = 0; - int call_count = 0; - int i = 0; - struct timespec ts[2]; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - source = sh->source; - active_sinks = sh->active_sinks; - - /* - * 4 calls per sink - chown, chmod, utimes, setxattr - */ + loc_wipe (&loc); if (xattr) - call_count = active_sinks * 4; - else - call_count = active_sinks * 3; - - local->call_count = call_count; - -#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC - ts[0] = sh->buf[source].st_atim; - ts[1] = sh->buf[source].st_mtim; -#elif HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC - ts[0] = sh->buf[source].st_atimespec; - ts[1] = sh->buf[source].st_mtimespec; -#else - ts[0].tv_sec = sh->buf[source].st_atime; - ts[1].tv_sec = sh->buf[source].st_mtime; -#endif - - for (i = 0; i < priv->child_count; i++) { - if (call_count == 0) { - break; - } - if (sh->sources[i] || !local->child_up[i]) - continue; - - gf_log (this->name, GF_LOG_DEBUG, - "self-healing metadata of %s from %s to %s", - local->loc.path, priv->children[source]->name, - priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->chown, - &local->loc, - sh->buf[source].st_uid, - sh->buf[source].st_gid); - - STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->chmod, - &local->loc, sh->buf[source].st_mode); - - STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->utimens, - &local->loc, ts); - - call_count = call_count - 3; - - if (!xattr) - continue; - - STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setxattr, - &local->loc, xattr, 0); - call_count--; - } + dict_unref (xattr); return 0; } -int -afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) +/* + * Look for mismatching uid/gid or mode even if xattrs don't say so, and + * pick one arbitrarily as winner. + */ + +static int +__afr_selfheal_metadata_finalize_source (xlator_t *this, unsigned char *sources, + unsigned char *sinks, + unsigned char *locked_on, + struct afr_reply *replies) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; + int i = 0; + afr_private_t *priv = NULL; + struct iatt first = {0, }; + int source = -1; + int locked_count = 0; + int sources_count = 0; + int sinks_count = 0; - int i; - - local = frame->local; - sh = &local->self_heal; priv = this->private; - source = sh->source; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "getxattr of %s failed on subvolume %s (%s). proceeding without xattr", - local->loc.path, priv->children[source]->name, - strerror (op_errno)); + locked_count = AFR_COUNT (locked_on, priv->child_count); + sources_count = AFR_COUNT (sources, priv->child_count); + sinks_count = AFR_COUNT (sinks, priv->child_count); - afr_sh_metadata_sync (frame, this, NULL); - } else { - for (i = 0; i < priv->child_count; i++) { - dict_del (xattr, priv->pending_key[i]); - } - - afr_sh_metadata_sync (frame, this, xattr); + if (locked_count == sinks_count || !sources_count) { + if (!priv->metadata_splitbrain_forced_heal) { + return -EIO; + } + /* Metadata split brain, select one subvol + arbitrarily */ + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i] && sinks[i]) { + sources[i] = 1; + sinks[i] = 0; + break; + } + } } - return 0; -} - - -int -afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int active_sinks = 0; - int source = 0; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] == 0 && local->child_up[i] == 1) { - active_sinks++; - sh->success[i] = 1; + if (!sources[i]) + continue; + if (source == -1) { + source = i; + first = replies[i].poststat; } } - sh->success[source] = 1; - - if (active_sinks == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "no active sinks for performing self-heal on file %s", - local->loc.path); - afr_sh_metadata_finish (frame, this); - return 0; - } - sh->active_sinks = active_sinks; - - gf_log (this->name, GF_LOG_TRACE, - "syncing metadata of %s from subvolume %s to %d active sinks", - local->loc.path, priv->children[source]->name, active_sinks); - - STACK_WIND (frame, afr_sh_metadata_getxattr_cbk, - priv->children[source], - priv->children[source]->fops->getxattr, - &local->loc, NULL); - - return 0; -} - - -int -afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int nsources = 0; - int source = 0; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - afr_sh_build_pending_matrix (priv, sh->pending_matrix, sh->xattr, - priv->child_count, - AFR_METADATA_TRANSACTION); - - afr_sh_print_pending_matrix (sh->pending_matrix, this); - - nsources = afr_sh_mark_sources (sh, priv->child_count, - AFR_SELF_HEAL_METADATA); - - afr_sh_supress_errenous_children (sh->sources, sh->child_errno, - priv->child_count); - - if (nsources == 0) { - gf_log (this->name, GF_LOG_TRACE, - "No self-heal needed for %s", - local->loc.path); - - afr_sh_metadata_finish (frame, this); - return 0; - } - - if ((nsources == -1) - && (priv->favorite_child != -1) - && (sh->child_errno[priv->favorite_child] == 0)) { - - gf_log (this->name, GF_LOG_WARNING, - "Picking favorite child %s as authentic source to resolve conflicting metadata of %s", - priv->children[priv->favorite_child]->name, - local->loc.path); - - sh->sources[priv->favorite_child] = 1; - - nsources = afr_sh_source_count (sh->sources, - priv->child_count); - } - - if (nsources == -1) { - gf_log (this->name, GF_LOG_ERROR, - "Unable to self-heal permissions/ownership of '%s' " - "(possible split-brain). Please fix the file on " - "all backend volumes", local->loc.path); - - local->govinda_gOvinda = 1; - - afr_sh_metadata_finish (frame, this); - return 0; - } - - source = afr_sh_select_source (sh->sources, priv->child_count); - - if (source == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "No active sources found."); - - afr_sh_metadata_finish (frame, this); - return 0; - } - sh->source = source; - - /* detect changes not visible through pending flags -- JIC */ for (i = 0; i < priv->child_count; i++) { - if (i == source || sh->child_errno[i]) + if (!sources[i]) continue; - - if (PERMISSION_DIFFERS (&sh->buf[i], &sh->buf[source])) - sh->sources[i] = 0; - - if (OWNERSHIP_DIFFERS (&sh->buf[i], &sh->buf[source])) - sh->sources[i] = 0; + if (!IA_EQUAL (first, replies[i].poststat, type) || + !IA_EQUAL (first, replies[i].poststat, uid) || + !IA_EQUAL (first, replies[i].poststat, gid) || + !IA_EQUAL (first, replies[i].poststat, prot)) { + sources[i] = 0; + sinks[i] = 1; + } } - afr_sh_metadata_sync_prepare (frame, this); - - return 0; + return source; } -int -afr_sh_metadata_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct stat *buf, dict_t *xattr) +static int +__afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *locked_on, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, + struct afr_reply *replies) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = 0; + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + int i = 0; - - local = frame->local; - sh = &local->self_heal; priv = this->private; - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "path %s on subvolume %s is of mode 0%o", - local->loc.path, - priv->children[child_index]->name, - buf->st_mode); - - sh->buf[child_index] = *buf; - if (xattr) - sh->xattr[child_index] = dict_ref (xattr); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "path %s on subvolume %s => -1 (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - - sh->child_errno[child_index] = op_errno; - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_metadata_fix (frame, this); - - return 0; + ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid, + replies); + if (ret) + return ret; + + ret = afr_selfheal_find_direction (frame, this, replies, + AFR_METADATA_TRANSACTION, + locked_on, sources, sinks); + if (ret) + return ret; + + source = __afr_selfheal_metadata_finalize_source (this, sources, sinks, + locked_on, replies); + if (source < 0) + return -EIO; + + for (i = 0; i < priv->child_count; i++) + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). + + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. + */ + healed_sinks[i] = sinks[i] && locked_on[i]; + + return source; } -int -afr_sh_metadata_lookup (call_frame_t *frame, xlator_t *this) +static int +__afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - int call_count = 0; - dict_t *xattr_req = NULL; - int ret = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - call_count = local->child_count; - local->call_count = call_count; - - xattr_req = dict_new(); - - if (xattr_req) { - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_uint64 (xattr_req, - priv->pending_key[i], - 3 * sizeof(int32_t)); - } - } - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_metadata_lookup_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - &local->loc, xattr_req); - if (!--call_count) - break; - } - } - - if (xattr_req) - dict_unref (xattr_req); + afr_private_t *priv = NULL; + int ret = -1; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *data_lock = NULL; + unsigned char *healed_sinks = NULL; + struct afr_reply *locked_replies = NULL; + int source = -1; - return 0; -} + priv = this->private; + sources = alloca0 (priv->child_count); + sinks = alloca0 (priv->child_count); + healed_sinks = alloca0 (priv->child_count); + data_lock = alloca0 (priv->child_count); -int -afr_sh_metadata_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = (long) cookie; - - /* TODO: what if lock fails? */ - - local = frame->local; - sh = &local->self_heal; - priv = this->private; + locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count); - LOCK (&frame->lock); + ret = afr_selfheal_inodelk (frame, this, inode, this->name, + LLONG_MAX - 1, 0, data_lock); { - if (op_ret == -1) { - sh->op_failed = 1; - - gf_log (this->name, GF_LOG_DEBUG, - "locking of %s on child %d failed: %s", - local->loc.path, child_index, - strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_TRACE, - "inode of %s on child %d locked", - local->loc.path, child_index); + if (ret < 2) { + ret = -ENOTCONN; + goto unlock; } - } - UNLOCK (&frame->lock); - call_count = afr_frame_return (frame); + ret = __afr_selfheal_metadata_prepare (frame, this, inode, data_lock, + sources, sinks, healed_sinks, + locked_replies); + if (ret < 0) + goto unlock; - if (call_count == 0) { - if (sh->op_failed) { - afr_sh_metadata_finish (frame, this); - return 0; - } - - afr_sh_metadata_lookup (frame, this); + source = ret; + ret = 0; } - - return 0; +unlock: + afr_selfheal_uninodelk (frame, this, inode, this->name, + LLONG_MAX -1, 0, data_lock); + if (ret < 0) + goto out; + + ret = afr_selfheal_metadata_do (frame, this, inode, source, healed_sinks, + locked_replies); + if (ret) + goto out; + + ret = afr_selfheal_undo_pending (frame, this, inode, sources, sinks, + healed_sinks, AFR_METADATA_TRANSACTION, + locked_replies, data_lock); +out: + return ret; } int -afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this) +afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - int call_count = 0; - struct flock flock = {0, }; + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + int ret = 0; - - local = frame->local; - sh = &local->self_heal; priv = this->private; - call_count = local->child_count; - local->call_count = call_count; + locked_on = alloca0 (priv->child_count); - for (i = 0; i < priv->child_count; i++) { - flock.l_start = 0; - flock.l_len = 0; - flock.l_type = F_WRLCK; - - if (local->child_up[i]) { - gf_log (this->name, GF_LOG_TRACE, - "locking %s on subvolume %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_metadata_lk_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->inodelk, - this->name, - &local->loc, F_SETLK, &flock); - - if (!--call_count) - break; + ret = afr_selfheal_tryinodelk (frame, this, inode, priv->sh_domain, 0, 0, + locked_on); + { + if (ret < 2) { + /* Either less than two subvols available, or another + selfheal (from another server) is in progress. Skip + for now in any case there isn't anything to do. + */ + ret = -ENOTCONN; + goto unlock; } - } - return 0; -} - - -int -afr_self_heal_metadata (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = this->private; - - - local = frame->local; - sh = &local->self_heal; - - if (local->need_metadata_self_heal && priv->metadata_self_heal) { - afr_sh_metadata_lock (frame, this); - } else { - afr_sh_metadata_done (frame, this); + ret = __afr_selfheal_metadata (frame, this, inode, locked_on); } +unlock: + afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0, locked_on); - return 0; + return ret; } - diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c new file mode 100644 index 000000000..ce80b8da3 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-name.c @@ -0,0 +1,457 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "afr.h" +#include "afr-self-heal.h" + + +int +__afr_selfheal_assign_gfid (call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, const char *bname, inode_t *inode, + struct afr_reply *replies, int gfid_idx) +{ + int i = 0; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + int ret = 0; + loc_t loc = {0, }; + + priv = this->private; + + uuid_copy (parent->gfid, pargfid); + + xdata = dict_new (); + if (!xdata) { + return -ENOMEM; + } + + ret = dict_set_static_bin (xdata, "gfid-req", + replies[gfid_idx].poststat.ia_gfid, 16); + if (ret) { + dict_destroy (xdata); + return -ENOMEM; + } + + loc.parent = inode_ref (parent); + loc.inode = inode_ref (inode); + uuid_copy (loc.pargfid, pargfid); + loc.name = bname; + + for (i = 0; i < priv->child_count; i++) { + if (replies[i].op_ret == 0 || replies[i].op_errno != ENODATA) + continue; + + ret = syncop_lookup (priv->children[i], &loc, xdata, 0, 0, 0); + } + + loc_wipe (&loc); + dict_unref (xdata); + + return ret; +} + + +int +__afr_selfheal_name_impunge (call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, const char *bname, inode_t *inode, + struct afr_reply *replies, int gfid_idx) +{ + int i = 0; + afr_private_t *priv = NULL; + int ret = 0; + + priv = this->private; + + uuid_copy (parent->gfid, pargfid); + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (uuid_compare (replies[i].poststat.ia_gfid, + replies[gfid_idx].poststat.ia_gfid) == 0) + continue; + + ret |= afr_selfheal_recreate_entry (frame, this, i, gfid_idx, + parent, bname, inode, replies); + } + + return ret; +} + + +int +__afr_selfheal_name_expunge (call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, const char *bname, inode_t *inode, + struct afr_reply *replies) +{ + loc_t loc = {0, }; + int i = 0; + afr_private_t *priv = NULL; + char g[64]; + int ret = 0; + + priv = this->private; + + loc.parent = inode_ref (parent); + uuid_copy (loc.pargfid, pargfid); + loc.name = bname; + loc.inode = inode_ref (inode); + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (replies[i].op_ret) + continue; + + switch (replies[i].poststat.ia_type) { + case IA_IFDIR: + gf_log (this->name, GF_LOG_WARNING, + "expunging dir %s/%s (%s) on %s", + uuid_utoa (pargfid), bname, + uuid_utoa_r (replies[i].poststat.ia_gfid, g), + priv->children[i]->name); + ret |= syncop_rmdir (priv->children[i], &loc, 1); + break; + default: + gf_log (this->name, GF_LOG_WARNING, + "expunging file %s/%s (%s) on %s", + uuid_utoa (pargfid), bname, + uuid_utoa_r (replies[i].poststat.ia_gfid, g), + priv->children[i]->name); + ret |= syncop_unlink (priv->children[i], &loc); + break; + } + } + + loc_wipe (&loc); + + return ret; + +} + + +int +__afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, const char *bname, inode_t *inode, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, int source, + unsigned char *locked_on, struct afr_reply *replies) +{ + int i = 0; + afr_private_t *priv = NULL; + uuid_t gfid = {0, }; + int gfid_idx = -1; + gf_boolean_t source_is_empty = _gf_true; + gf_boolean_t need_heal = _gf_false; + int first_idx = -1; + char g1[64],g2[64]; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (first_idx == -1) { + first_idx = i; + continue; + } + + if (replies[i].op_ret != replies[first_idx].op_ret) + need_heal = _gf_true; + + if (uuid_compare (replies[i].poststat.ia_gfid, + replies[first_idx].poststat.ia_gfid)) + need_heal = _gf_true; + } + + if (!need_heal) + return 0; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (!replies[i].op_ret && (source == -1 || sources[i])) { + source_is_empty = _gf_false; + break; + } + } + + if (source_is_empty) { + return __afr_selfheal_name_expunge (frame, this, parent, pargfid, + bname, inode, replies); + } + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (uuid_is_null (replies[i].poststat.ia_gfid)) + continue; + + if (uuid_is_null (gfid)) { + uuid_copy (gfid, replies[i].poststat.ia_gfid); + gfid_idx = i; + continue; + } + + if (sources[i] || source == -1) { + if (gfid_idx != -1 && + (sources[gfid_idx] || source == -1) && + uuid_compare (gfid, replies[i].poststat.ia_gfid)) { + gf_log (this->name, GF_LOG_WARNING, + "GFID mismatch for <gfid:%s>/%s " + "%s on %s and %s on %s", + uuid_utoa (pargfid), bname, + uuid_utoa_r (replies[i].poststat.ia_gfid, g1), + priv->children[i]->name, + uuid_utoa_r (replies[gfid_idx].poststat.ia_gfid, g2), + priv->children[gfid_idx]->name); + return -1; + } + + uuid_copy (gfid, replies[i].poststat.ia_gfid); + gfid_idx = i; + continue; + } + } + + if (gfid_idx == -1) + return -1; + + __afr_selfheal_assign_gfid (frame, this, parent, pargfid, bname, inode, + replies, gfid_idx); + + return __afr_selfheal_name_impunge (frame, this, parent, pargfid, + bname, inode, replies, gfid_idx); +} + + +int +__afr_selfheal_name_finalize_source (xlator_t *this, unsigned char *sources, + unsigned char *sinks, unsigned char *locked_on, + struct afr_reply *replies) +{ + int i = 0; + afr_private_t *priv = NULL; + int source = -1; + int locked_count = 0; + int sources_count = 0; + int sinks_count = 0; + + priv = this->private; + + locked_count = AFR_COUNT (locked_on, priv->child_count); + sources_count = AFR_COUNT (sources, priv->child_count); + sinks_count = AFR_COUNT (sinks, priv->child_count); + + if (locked_count == sinks_count || !sources_count) { + return -1; + } + + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + source = i; + break; + } + } + + return source; +} + + +int +__afr_selfheal_name_prepare (call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, struct afr_reply *replies, + int *source_p) +{ + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + int i = 0; + + priv = this->private; + + ret = afr_selfheal_unlocked_discover (frame, parent, pargfid, replies); + if (ret) + return ret; + + ret = afr_selfheal_find_direction (frame, this, replies, + AFR_ENTRY_TRANSACTION, + locked_on, sources, sinks); + if (ret) + return ret; + + source = __afr_selfheal_name_finalize_source (this, sources, sinks, + locked_on, replies); + if (source < 0) { + /* If source is < 0 (typically split-brain), we perform a + conservative merge of entries rather than erroring out */ + } + *source_p = source; + + for (i = 0; i < priv->child_count; i++) + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). + + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. + */ + healed_sinks[i] = sinks[i] && locked_on[i]; + + return ret; +} + + +int +afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, const char *bname) +{ + afr_private_t *priv = NULL; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *healed_sinks = NULL; + unsigned char *locked_on = NULL; + int source = -1; + struct afr_reply *replies = NULL; + int ret = -1; + inode_t *inode = NULL; + + priv = this->private; + + locked_on = alloca0 (priv->child_count); + sources = alloca0 (priv->child_count); + sinks = alloca0 (priv->child_count); + healed_sinks = alloca0 (priv->child_count); + + replies = alloca0 (priv->child_count * sizeof(*replies)); + + ret = afr_selfheal_entrylk (frame, this, parent, this->name, bname, + locked_on); + { + if (ret < 2) { + ret = -ENOTCONN; + goto unlock; + } + + ret = __afr_selfheal_name_prepare (frame, this, parent, pargfid, + locked_on, sources, sinks, + healed_sinks, replies, + &source); + if (ret) + goto unlock; + + inode = afr_selfheal_unlocked_lookup_on (frame, parent, bname, + replies, locked_on); + if (!inode) { + ret = -ENOMEM; + goto unlock; + } + + ret = __afr_selfheal_name_do (frame, this, parent, pargfid, bname, + inode, sources, sinks, healed_sinks, + source, locked_on, replies); + } +unlock: + afr_selfheal_unentrylk (frame, this, parent, this->name, bname, + locked_on); + if (inode) + inode_unref (inode); + + return ret; +} + + +int +afr_selfheal_name_unlocked_inspect (call_frame_t *frame, xlator_t *this, + inode_t *parent, uuid_t pargfid, + const char *bname, gf_boolean_t *need_heal) +{ + afr_private_t *priv = NULL; + int i = 0; + struct afr_reply *replies = NULL; + inode_t *inode = NULL; + int first_idx = -1; + + priv = this->private; + + replies = alloca0 (sizeof (*replies) * priv->child_count); + + inode = afr_selfheal_unlocked_lookup_on (frame, parent, bname, + replies, priv->child_up); + if (!inode) + return -ENOMEM; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (first_idx == -1) { + first_idx = i; + continue; + } + + if (replies[i].op_ret != replies[first_idx].op_ret) + *need_heal = _gf_true; + + if (uuid_compare (replies[i].poststat.ia_gfid, + replies[first_idx].poststat.ia_gfid)) + *need_heal = _gf_true; + } + + if (inode) + inode_unref (inode); + return 0; +} + +int +afr_selfheal_name (xlator_t *this, uuid_t pargfid, const char *bname) +{ + inode_t *parent = NULL; + call_frame_t *frame = NULL; + int ret = -1; + gf_boolean_t need_heal = _gf_false; + + parent = afr_inode_find (this, pargfid); + if (!parent) + goto out; + + frame = afr_frame_create (this); + if (!frame) + goto out; + + ret = afr_selfheal_name_unlocked_inspect (frame, this, parent, pargfid, + bname, &need_heal); + if (ret) + goto out; + + if (need_heal) + afr_selfheal_name_do (frame, this, parent, pargfid, bname); +out: + if (parent) + inode_unref (parent); + if (frame) + AFR_STACK_DESTROY (frame); + + return ret; +} diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 73abd8fb9..a1b972ac3 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -1,52 +1,167 @@ /* - Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ -#ifndef __AFR_SELF_HEAL_H__ -#define __AFR_SELF_HEAL_H__ -#include <sys/stat.h> +#ifndef _AFR_SELFHEAL_H +#define _AFR_SELFHEAL_H + + +/* Perform fop on all UP subvolumes and wait for all callbacks to return */ + +#define AFR_ONALL(frame, rfn, fop, args ...) do { \ + afr_local_t *__local = frame->local; \ + afr_private_t *__priv = frame->this->private; \ + int __i = 0, __count = 0; \ + \ + afr_replies_wipe (__local, __priv); \ + \ + for (__i = 0; __i < __priv->child_count; __i++) { \ + if (!__priv->child_up[__i]) continue; \ + STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i, \ + __priv->children[__i], \ + __priv->children[__i]->fops->fop, args); \ + __count++; \ + } \ + syncbarrier_wait (&__local->barrier, __count); \ + } while (0) + + +/* Perform fop on all subvolumes represented by list[] array and wait + for all callbacks to return */ + +#define AFR_ONLIST(list, frame, rfn, fop, args ...) do { \ + afr_local_t *__local = frame->local; \ + afr_private_t *__priv = frame->this->private; \ + int __i = 0, __count = 0; \ + \ + afr_replies_wipe (__local, __priv); \ + \ + for (__i = 0; __i < __priv->child_count; __i++) { \ + if (!list[__i]) continue; \ + STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i, \ + __priv->children[__i], \ + __priv->children[__i]->fops->fop, args); \ + __count++; \ + } \ + syncbarrier_wait (&__local->barrier, __count); \ + } while (0) + + +#define AFR_SEQ(frame, rfn, fop, args ...) do { \ + afr_local_t *__local = frame->local; \ + afr_private_t *__priv = frame->this->private; \ + int __i = 0; \ + \ + afr_replies_wipe (__local, __priv); \ + \ + for (__i = 0; __i < __priv->child_count; __i++) { \ + if (!__priv->child_up[__i]) continue; \ + STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i, \ + __priv->children[__i], \ + __priv->children[__i]->fops->fop, args); \ + syncbarrier_wait (&__local->barrier, 1); \ + } \ + } while (0) + + +#define ALLOC_MATRIX(n, type) ({type **__ptr = NULL; \ + int __i; \ + __ptr = alloca0 (n * sizeof(type *)); \ + for (__i = 0; __i < n; __i++) __ptr[__i] = alloca0 (n * sizeof(type)); \ + __ptr;}) + + +#define IA_EQUAL(f,s,field) (memcmp (&(f.ia_##field), &(s.ia_##field), sizeof (s.ia_##field)) == 0) + + +int +afr_selfheal (xlator_t *this, uuid_t gfid); + +int +afr_selfheal_name (xlator_t *this, uuid_t gfid, const char *name); + +int +afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode); + +int +afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode); + +int +afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t *inode); -#define FILETYPE_DIFFERS(buf1,buf2) ((S_IFMT & ((struct stat *)buf1)->st_mode) != (S_IFMT & ((struct stat *)buf2)->st_mode)) -#define PERMISSION_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_mode) != (((struct stat *)buf2)->st_mode)) -#define OWNERSHIP_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_uid) != (((struct stat *)buf2)->st_uid) || (((struct stat *)buf1)->st_gid != (((struct stat *)buf2)->st_gid))) -#define SIZE_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_size) != (((struct stat *)buf2)->st_size)) -#define SIZE_GREATER(buf1,buf2) ((((struct stat *)buf1)->st_size > (((struct stat *)buf2)->st_size))) +int +afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on); int -afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this); +afr_selfheal_tryinodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on); + int -afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this); +afr_selfheal_uninodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + const unsigned char *locked_on); + int -afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this); +afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on); int -afr_self_heal_entry (call_frame_t *frame, xlator_t *this); +afr_selfheal_tryentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on); int -afr_self_heal_data (call_frame_t *frame, xlator_t *this); +afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on); int -afr_self_heal_metadata (call_frame_t *frame, xlator_t *this); +afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode, + uuid_t gfid, struct afr_reply *replies); + +inode_t * +afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent, + const char *name, struct afr_reply *replies, + unsigned char *lookup_on); int -afr_self_heal (call_frame_t *frame, xlator_t *this, - int (*completion_cbk) (call_frame_t *, xlator_t *)); +afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, + afr_transaction_type type, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks); + +int +afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies, + afr_transaction_type type, int *dirty, int **matrix); + +int +afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, afr_transaction_type type, + struct afr_reply *replies, unsigned char *locked_on); + +int +afr_selfheal_recreate_entry (call_frame_t *frame, xlator_t *this, int dst, + int source, inode_t *dir, const char *name, + inode_t *inode, struct afr_reply *replies); + +int +afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode, + int subvol, dict_t *xattr); + +call_frame_t * +afr_frame_create (xlator_t *this); + +inode_t * +afr_inode_find (xlator_t *this, uuid_t gfid); -#endif /* __AFR_SELF_HEAL_H__ */ +#endif /* !_AFR_SELFHEAL_H */ diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c new file mode 100644 index 000000000..4bfe909bc --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heald.c @@ -0,0 +1,1256 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "afr.h" +#include "afr-self-heal.h" +#include "afr-self-heald.h" +#include "protocol-common.h" + +#define SHD_INODE_LRU_LIMIT 2048 +#define AFR_EH_HEALED_LIMIT 1024 +#define AFR_EH_HEAL_FAIL_LIMIT 1024 +#define AFR_EH_SPLIT_BRAIN_LIMIT 1024 +#define AFR_STATISTICS_HISTORY_SIZE 50 + + +#define ASSERT_LOCAL(this, healer) \ + if (!afr_shd_is_subvol_local(this, healer->subvol)) { \ + healer->local = _gf_false; \ + if (safe_break (healer)) { \ + break; \ + } else { \ + continue; \ + } \ + } else { \ + healer->local = _gf_true; \ + } + + +#define NTH_INDEX_HEALER(this, n) &((((afr_private_t *)this->private))->shd.index_healers[n]) +#define NTH_FULL_HEALER(this, n) &((((afr_private_t *)this->private))->shd.full_healers[n]) + +int afr_shd_gfid_to_path (xlator_t *this, xlator_t *subvol, uuid_t gfid, char **path_p); + +char * +afr_subvol_name (xlator_t *this, int subvol) +{ + afr_private_t *priv = NULL; + + priv = this->private; + if (subvol < 0 || subvol > priv->child_count) + return NULL; + + return priv->children[subvol]->name; +} + + +void +afr_destroy_crawl_event_data (void *data) +{ + return; +} + + +void +afr_destroy_shd_event_data (void *data) +{ + shd_event_t *shd_event = data; + + if (!shd_event) + return; + GF_FREE (shd_event->path); + + return; +} + + +gf_boolean_t +afr_shd_is_subvol_local (xlator_t *this, int subvol) +{ + char *pathinfo = NULL; + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + int ret = 0; + gf_boolean_t is_local = _gf_false; + loc_t loc = {0, }; + + priv = this->private; + + loc.inode = this->itable->root; + uuid_copy (loc.gfid, loc.inode->gfid); + + ret = syncop_getxattr (priv->children[subvol], &loc, &xattr, + GF_XATTR_PATHINFO_KEY); + if (ret) + return _gf_false; + if (!xattr) + return _gf_false; + + ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &pathinfo); + if (ret) + return _gf_false; + + afr_local_pathinfo (pathinfo, &is_local); + + gf_log (this->name, GF_LOG_DEBUG, "subvol %s is %slocal", + priv->children[subvol]->name, is_local? "" : "not "); + + return is_local; +} + + +int +__afr_shd_healer_wait (struct subvol_healer *healer) +{ + afr_private_t *priv = NULL; + struct timespec wait_till = {0, }; + int ret = 0; + + priv = healer->this->private; + +disabled_loop: + wait_till.tv_sec = time (NULL) + 60; + + while (!healer->rerun) { + ret = pthread_cond_timedwait (&healer->cond, + &healer->mutex, + &wait_till); + if (ret == ETIMEDOUT) + break; + } + + ret = healer->rerun; + healer->rerun = 0; + + if (!priv->shd.enabled) + goto disabled_loop; + + return ret; +} + + +int +afr_shd_healer_wait (struct subvol_healer *healer) +{ + int ret = 0; + + pthread_mutex_lock (&healer->mutex); + { + ret = __afr_shd_healer_wait (healer); + } + pthread_mutex_unlock (&healer->mutex); + + return ret; +} + + +gf_boolean_t +safe_break (struct subvol_healer *healer) +{ + gf_boolean_t ret = _gf_false; + + pthread_mutex_lock (&healer->mutex); + { + if (healer->rerun) + goto unlock; + + healer->running = _gf_false; + ret = _gf_true; + } +unlock: + pthread_mutex_unlock (&healer->mutex); + + return ret; +} + + +inode_t * +afr_shd_inode_find (xlator_t *this, xlator_t *subvol, uuid_t gfid) +{ + inode_t *inode = NULL; + int ret = 0; + loc_t loc = {0, }; + struct iatt iatt = {0, }; + + inode = inode_find (this->itable, gfid); + if (inode) + goto out; + + loc.inode = inode_new (this->itable); + if (!loc.inode) + goto out; + uuid_copy (loc.gfid, gfid); + + ret = syncop_lookup (subvol, &loc, NULL, &iatt, NULL, NULL); + if (ret < 0) + goto out; + + inode = inode_link (loc.inode, NULL, NULL, &iatt); + if (inode) + inode_lookup (inode); +out: + loc_wipe (&loc); + return inode; +} + + +fd_t * +afr_shd_index_opendir (xlator_t *this, int child) +{ + fd_t *fd = NULL; + afr_private_t *priv = NULL; + xlator_t *subvol = NULL; + loc_t rootloc = {0, }; + inode_t *inode = NULL; + int ret = 0; + dict_t *xattr = NULL; + void *index_gfid = NULL; + + priv = this->private; + subvol = priv->children[child]; + + rootloc.inode = inode_ref (this->itable->root); + uuid_copy (rootloc.gfid, rootloc.inode->gfid); + + ret = syncop_getxattr (subvol, &rootloc, &xattr, + GF_XATTROP_INDEX_GFID); + if (ret || !xattr) { + errno = -ret; + goto out; + } + + ret = dict_get_ptr (xattr, GF_XATTROP_INDEX_GFID, &index_gfid); + if (ret) + goto out; + + gf_log (this->name, GF_LOG_DEBUG, "index-dir gfid for %s: %s", + subvol->name, uuid_utoa (index_gfid)); + + inode = afr_shd_inode_find (this, subvol, index_gfid); + if (!inode) + goto out; + fd = fd_anonymous (inode); +out: + loc_wipe (&rootloc); + if (xattr) + dict_unref (xattr); + return fd; +} + + +int +afr_shd_index_purge (xlator_t *subvol, inode_t *inode, char *name) +{ + loc_t loc = {0, }; + int ret = 0; + + loc.parent = inode_ref (inode); + loc.name = name; + + ret = syncop_unlink (subvol, &loc); + + loc_wipe (&loc); + return ret; +} + + +int +afr_shd_selfheal_name (struct subvol_healer *healer, int child, uuid_t parent, + const char *bname) +{ + int ret = -1; + + ret = afr_selfheal_name (THIS, parent, bname); + + return ret; +} + +int +afr_shd_selfheal (struct subvol_healer *healer, int child, uuid_t gfid) +{ + int ret = 0; + eh_t *eh = NULL; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + shd_event_t *shd_event = NULL; + char *path = NULL; + xlator_t *subvol = NULL; + xlator_t *this = NULL; + crawl_event_t *crawl_event = NULL; + + this = healer->this; + priv = this->private; + shd = &priv->shd; + crawl_event = &healer->crawl_event; + + subvol = priv->children[child]; + + ret = afr_selfheal (this, gfid); + + if (ret == -EIO) { + eh = shd->split_brain; + crawl_event->split_brain_count++; + } else if (ret < 0) { + eh = shd->heal_failed; + crawl_event->heal_failed_count++; + } else if (ret == 0) { + eh = shd->healed; + crawl_event->healed_count++; + } + + afr_shd_gfid_to_path (this, subvol, gfid, &path); + if (!path) + return ret; + + if (eh) { + shd_event = GF_CALLOC (1, sizeof(*shd_event), + gf_afr_mt_shd_event_t); + if (!shd_event) { + GF_FREE (path); + return ret; + } + + shd_event->child = child; + shd_event->path = path; + + if (eh_save_history (eh, shd_event) < 0) { + GF_FREE (shd_event); + GF_FREE (path); + } + } + return ret; +} + + +void +afr_shd_sweep_prepare (struct subvol_healer *healer) +{ + crawl_event_t *event = NULL; + + event = &healer->crawl_event; + + event->healed_count = 0; + event->split_brain_count = 0; + event->heal_failed_count = 0; + + time (&event->start_time); + event->end_time = 0; +} + + +void +afr_shd_sweep_done (struct subvol_healer *healer) +{ + crawl_event_t *event = NULL; + crawl_event_t *history = NULL; + afr_self_heald_t *shd = NULL; + + event = &healer->crawl_event; + shd = &(((afr_private_t *)healer->this->private)->shd); + + time (&event->end_time); + history = memdup (event, sizeof (*event)); + event->start_time = 0; + + if (!history) + return; + + if (eh_save_history (shd->statistics[healer->subvol], history) < 0) + GF_FREE (history); +} + + +int +afr_shd_index_sweep (struct subvol_healer *healer) +{ + xlator_t *this = NULL; + int child = -1; + fd_t *fd = NULL; + xlator_t *subvol = NULL; + afr_private_t *priv = NULL; + off_t offset = 0; + gf_dirent_t entries; + gf_dirent_t *entry = NULL; + uuid_t gfid; + int ret = 0; + int count = 0; + + this = healer->this; + child = healer->subvol; + priv = this->private; + subvol = priv->children[child]; + + fd = afr_shd_index_opendir (this, child); + if (!fd) { + gf_log (this->name, GF_LOG_WARNING, + "unable to opendir index-dir on %s", subvol->name); + return -errno; + } + + INIT_LIST_HEAD (&entries.list); + + while ((ret = syncop_readdir (subvol, fd, 131072, offset, &entries))) { + if (ret > 0) + ret = 0; + list_for_each_entry (entry, &entries.list, list) { + offset = entry->d_off; + + if (!priv->shd.enabled) { + ret = -EBUSY; + break; + } + + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; + + gf_log (this->name, GF_LOG_DEBUG, "got entry: %s", + entry->d_name); + + ret = uuid_parse (entry->d_name, gfid); + if (ret) + continue; + + ret = afr_shd_selfheal (healer, child, gfid); + if (ret == 0) + count++; + + if (ret == -ENOENT || ret == -ESTALE) { + afr_shd_index_purge (subvol, fd->inode, + entry->d_name); + ret = 0; + } + } + + gf_dirent_free (&entries); + if (ret) + break; + } + + if (fd) + fd_unref (fd); + if (!ret) + ret = count; + return ret; +} + + +int +afr_shd_full_sweep (struct subvol_healer *healer, inode_t *inode) +{ + fd_t *fd = NULL; + xlator_t *this = NULL; + xlator_t *subvol = NULL; + afr_private_t *priv = NULL; + off_t offset = 0; + gf_dirent_t entries; + gf_dirent_t *entry = NULL; + int ret = 0; + + this = healer->this; + priv = this->private; + subvol = priv->children[healer->subvol]; + + fd = fd_anonymous (inode); + if (!fd) + return -errno; + + INIT_LIST_HEAD (&entries.list); + + while ((ret = syncop_readdirp (subvol, fd, 131072, offset, 0, &entries))) { + if (ret < 0) + break; + + ret = gf_link_inodes_from_dirent (this, fd->inode, &entries); + if (ret) + break; + + list_for_each_entry (entry, &entries.list, list) { + offset = entry->d_off; + + if (!priv->shd.enabled) { + ret = -EBUSY; + break; + } + + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; + + afr_shd_selfheal_name (healer, healer->subvol, + inode->gfid, entry->d_name); + + afr_shd_selfheal (healer, healer->subvol, + entry->d_stat.ia_gfid); + + if (entry->d_stat.ia_type == IA_IFDIR) { + ret = afr_shd_full_sweep (healer, entry->inode); + if (ret) + break; + } + } + + gf_dirent_free (&entries); + if (ret) + break; + } + + if (fd) + fd_unref (fd); + return ret; +} + + +void * +afr_shd_index_healer (void *data) +{ + struct subvol_healer *healer = NULL; + xlator_t *this = NULL; + int ret = 0; + + healer = data; + THIS = this = healer->this; + + for (;;) { + afr_shd_healer_wait (healer); + + ASSERT_LOCAL(this, healer); + + do { + gf_log (this->name, GF_LOG_DEBUG, + "starting index sweep on subvol %s", + afr_subvol_name (this, healer->subvol)); + + afr_shd_sweep_prepare (healer); + + ret = afr_shd_index_sweep (healer); + + afr_shd_sweep_done (healer); + /* + As long as at least one gfid was + healed, keep retrying. We may have + just healed a directory and thereby + created entries for other gfids which + could not be healed thus far. + */ + + gf_log (this->name, GF_LOG_DEBUG, + "finished index sweep on subvol %s", + afr_subvol_name (this, healer->subvol)); + /* + Give a pause before retrying to avoid a busy loop + in case the only entry in index is because of + an ongoing I/O. + */ + sleep (1); + } while (ret > 0); + } + + return NULL; +} + + +void * +afr_shd_full_healer (void *data) +{ + struct subvol_healer *healer = NULL; + xlator_t *this = NULL; + int run = 0; + + healer = data; + THIS = this = healer->this; + + for (;;) { + pthread_mutex_lock (&healer->mutex); + { + run = __afr_shd_healer_wait (healer); + if (!run) + healer->running = _gf_false; + } + pthread_mutex_unlock (&healer->mutex); + + if (!run) + break; + + ASSERT_LOCAL(this, healer); + + gf_log (this->name, GF_LOG_INFO, + "starting full sweep on subvol %s", + afr_subvol_name (this, healer->subvol)); + + afr_shd_sweep_prepare (healer); + + afr_shd_full_sweep (healer, this->itable->root); + + afr_shd_sweep_done (healer); + + gf_log (this->name, GF_LOG_INFO, + "finished full sweep on subvol %s", + afr_subvol_name (this, healer->subvol)); + } + + return NULL; +} + + +int +afr_shd_healer_init (xlator_t *this, struct subvol_healer *healer) +{ + int ret = 0; + + ret = pthread_mutex_init (&healer->mutex, NULL); + if (ret) + goto out; + + ret = pthread_cond_init (&healer->cond, NULL); + if (ret) + goto out; + + healer->this = this; + healer->running = _gf_false; + healer->rerun = _gf_false; + healer->local = _gf_false; +out: + return ret; +} + + +int +afr_shd_healer_spawn (xlator_t *this, struct subvol_healer *healer, + void *(threadfn)(void *)) +{ + int ret = 0; + + pthread_mutex_lock (&healer->mutex); + { + if (healer->running) { + pthread_cond_signal (&healer->cond); + } else { + ret = gf_thread_create (&healer->thread, NULL, + threadfn, healer); + if (ret) + goto unlock; + healer->running = 1; + } + + healer->rerun = 1; + } +unlock: + pthread_mutex_unlock (&healer->mutex); + + return ret; +} + + +int +afr_shd_full_healer_spawn (xlator_t *this, int subvol) +{ + return afr_shd_healer_spawn (this, NTH_FULL_HEALER (this, subvol), + afr_shd_full_healer); +} + + +int +afr_shd_index_healer_spawn (xlator_t *this, int subvol) +{ + return afr_shd_healer_spawn (this, NTH_INDEX_HEALER (this, subvol), + afr_shd_index_healer); +} + + +int +afr_shd_dict_add_crawl_event (xlator_t *this, dict_t *output, + crawl_event_t *crawl_event) +{ + int ret = 0; + uint64_t count = 0; + char key[256] = {0}; + int xl_id = 0; + uint64_t healed_count = 0; + uint64_t split_brain_count = 0; + uint64_t heal_failed_count = 0; + char *start_time_str = 0; + char *end_time_str = NULL; + char *crawl_type = NULL; + int progress = -1; + int child = -1; + + child = crawl_event->child; + healed_count = crawl_event->healed_count; + split_brain_count = crawl_event->split_brain_count; + heal_failed_count = crawl_event->heal_failed_count; + crawl_type = crawl_event->crawl_type; + + if (!crawl_event->start_time) + goto out; + + start_time_str = gf_strdup (ctime (&crawl_event->start_time)); + + if (crawl_event->end_time) + end_time_str = gf_strdup (ctime (&crawl_event->end_time)); + + ret = dict_get_int32 (output, this->name, &xl_id); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "xl does not have id"); + goto out; + } + + snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child); + ret = dict_get_uint64 (output, key, &count); + + + snprintf (key, sizeof (key), "statistics_healed_cnt-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_uint64(output, key, healed_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_healed_count to outout"); + goto out; + } + + snprintf (key, sizeof (key), "statistics_sb_cnt-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_uint64 (output, key, split_brain_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_split_brain_count to outout"); + goto out; + } + + snprintf (key, sizeof (key), "statistics_crawl_type-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_str (output, key, crawl_type); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_crawl_type to output"); + goto out; + } + + snprintf (key, sizeof (key), "statistics_heal_failed_cnt-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_uint64 (output, key, heal_failed_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_healed_failed_count to outout"); + goto out; + } + + snprintf (key, sizeof (key), "statistics_strt_time-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_dynstr (output, key, start_time_str); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_crawl_start_time to outout"); + goto out; + } else { + start_time_str = NULL; + } + + if (!end_time_str) + progress = 1; + else + progress = 0; + + snprintf (key, sizeof (key), "statistics_end_time-%d-%d-%"PRIu64, + xl_id, child, count); + if (!end_time_str) + end_time_str = gf_strdup ("Could not determine the end time"); + ret = dict_set_dynstr (output, key, end_time_str); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_crawl_end_time to outout"); + goto out; + } else { + end_time_str = NULL; + } + + snprintf (key, sizeof (key), "statistics_inprogress-%d-%d-%"PRIu64, + xl_id, child, count); + + ret = dict_set_int32 (output, key, progress); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_inprogress to outout"); + goto out; + } + + snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child); + ret = dict_set_uint64 (output, key, count + 1); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not increment the counter."); + goto out; + } +out: + GF_FREE (start_time_str); + GF_FREE (end_time_str); + return ret; +} + + +int +afr_shd_dict_add_path (xlator_t *this, dict_t *output, int child, char *path, + struct timeval *tv) +{ + int ret = -1; + uint64_t count = 0; + char key[256] = {0}; + int xl_id = 0; + + ret = dict_get_int32 (output, this->name, &xl_id); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "xl does not have id"); + goto out; + } + + snprintf (key, sizeof (key), "%d-%d-count", xl_id, child); + ret = dict_get_uint64 (output, key, &count); + + snprintf (key, sizeof (key), "%d-%d-%"PRIu64, xl_id, child, count); + ret = dict_set_dynstr (output, key, path); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s: Could not add to output", + path); + goto out; + } + + if (tv) { + snprintf (key, sizeof (key), "%d-%d-%"PRIu64"-time", xl_id, + child, count); + ret = dict_set_uint32 (output, key, tv->tv_sec); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s: Could not set time", + path); + goto out; + } + } + + snprintf (key, sizeof (key), "%d-%d-count", xl_id, child); + + ret = dict_set_uint64 (output, key, count + 1); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not increment count"); + goto out; + } + + ret = 0; +out: + return ret; +} + + +int +afr_shd_gfid_to_path (xlator_t *this, xlator_t *subvol, uuid_t gfid, char **path_p) +{ + loc_t loc = {0,}; + char *path = NULL; + dict_t *xattr = NULL; + int ret = 0; + + uuid_copy (loc.gfid, gfid); + loc.inode = inode_new (this->itable); + + ret = syncop_getxattr (subvol, &loc, &xattr, GFID_TO_PATH_KEY); + loc_wipe (&loc); + if (ret) + return ret; + + ret = dict_get_str (xattr, GFID_TO_PATH_KEY, &path); + if (ret || !path) + return -EINVAL; + + *path_p = gf_strdup (path); + if (!*path_p) + return -ENOMEM; + return 0; +} + + +int +afr_shd_gather_index_entries (xlator_t *this, int child, dict_t *output) +{ + fd_t *fd = NULL; + xlator_t *subvol = NULL; + afr_private_t *priv = NULL; + off_t offset = 0; + gf_dirent_t entries; + gf_dirent_t *entry = NULL; + uuid_t gfid; + int ret = 0; + int count = 0; + char *path = NULL; + + priv = this->private; + subvol = priv->children[child]; + + fd = afr_shd_index_opendir (this, child); + if (!fd) { + gf_log (this->name, GF_LOG_WARNING, + "unable to opendir index-dir on %s", subvol->name); + return -errno; + } + + INIT_LIST_HEAD (&entries.list); + + while ((ret = syncop_readdir (subvol, fd, 131072, offset, &entries))) { + if (ret > 0) + ret = 0; + list_for_each_entry (entry, &entries.list, list) { + offset = entry->d_off; + + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; + + gf_log (this->name, GF_LOG_DEBUG, "got entry: %s", + entry->d_name); + + ret = uuid_parse (entry->d_name, gfid); + if (ret) + continue; + + path = NULL; + ret = afr_shd_gfid_to_path (this, subvol, gfid, &path); + + if (ret == -ENOENT || ret == -ESTALE) { + afr_shd_index_purge (subvol, fd->inode, + entry->d_name); + ret = 0; + continue; + } + + ret = afr_shd_dict_add_path (this, output, child, path, + NULL); + } + + gf_dirent_free (&entries); + if (ret) + break; + } + + if (fd) + fd_unref (fd); + if (!ret) + ret = count; + return ret; +} + + +int +afr_add_shd_event (circular_buffer_t *cb, void *data) +{ + dict_t *output = NULL; + xlator_t *this = THIS; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + shd_event_t *shd_event = NULL; + char *path = NULL; + + output = data; + priv = this->private; + shd = &priv->shd; + shd_event = cb->data; + + if (!shd->index_healers[shd_event->child].local) + return 0; + + path = gf_strdup (shd_event->path); + if (!path) + return -ENOMEM; + + afr_shd_dict_add_path (this, output, shd_event->child, path, + &cb->tv); + return 0; +} + +int +afr_add_crawl_event (circular_buffer_t *cb, void *data) +{ + dict_t *output = NULL; + xlator_t *this = THIS; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + crawl_event_t *crawl_event = NULL; + + output = data; + priv = this->private; + shd = &priv->shd; + crawl_event = cb->data; + + if (!shd->index_healers[crawl_event->child].local) + return 0; + + afr_shd_dict_add_crawl_event (this, output, crawl_event); + + return 0; +} + + +int +afr_selfheal_daemon_init (xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int ret = -1; + int i = 0; + + priv = this->private; + shd = &priv->shd; + + this->itable = inode_table_new (SHD_INODE_LRU_LIMIT, this); + if (!this->itable) + goto out; + + shd->index_healers = GF_CALLOC (sizeof(*shd->index_healers), + priv->child_count, + gf_afr_mt_subvol_healer_t); + if (!shd->index_healers) + goto out; + + for (i = 0; i < priv->child_count; i++) { + shd->index_healers[i].subvol = i; + ret = afr_shd_healer_init (this, &shd->index_healers[i]); + if (ret) + goto out; + } + + shd->full_healers = GF_CALLOC (sizeof(*shd->full_healers), + priv->child_count, + gf_afr_mt_subvol_healer_t); + if (!shd->full_healers) + goto out; + for (i = 0; i < priv->child_count; i++) { + shd->full_healers[i].subvol = i; + ret = afr_shd_healer_init (this, &shd->full_healers[i]); + if (ret) + goto out; + } + + shd->healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false, + afr_destroy_shd_event_data); + if (!shd->healed) + goto out; + + shd->heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false, + afr_destroy_shd_event_data); + if (!shd->heal_failed) + goto out; + + shd->split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false, + afr_destroy_shd_event_data); + if (!shd->split_brain) + goto out; + + shd->statistics = GF_CALLOC (sizeof(eh_t *), priv->child_count, + gf_common_mt_eh_t); + if (!shd->statistics) + goto out; + + for (i = 0; i < priv->child_count ; i++) { + shd->statistics[i] = eh_new (AFR_STATISTICS_HISTORY_SIZE, + _gf_false, + afr_destroy_crawl_event_data); + if (!shd->statistics[i]) + goto out; + shd->full_healers[i].crawl_event.child = i; + shd->full_healers[i].crawl_event.crawl_type = "FULL"; + shd->index_healers[i].crawl_event.child = i; + shd->index_healers[i].crawl_event.crawl_type = "INDEX"; + } + + ret = 0; +out: + return ret; +} + + +int +afr_selfheal_childup (xlator_t *this, int subvol) +{ + afr_shd_index_healer_spawn (this, subvol); + + return 0; +} + + +int64_t +afr_shd_get_index_count (xlator_t *this, int i) +{ + afr_private_t *priv = NULL; + xlator_t *subvol = NULL; + uint64_t count = 0; + loc_t rootloc = {0, }; + dict_t *xattr = NULL; + int ret = -1; + + priv = this->private; + subvol = priv->children[i]; + + rootloc.inode = inode_ref (this->itable->root); + uuid_copy (rootloc.gfid, rootloc.inode->gfid); + + ret = syncop_getxattr (subvol, &rootloc, &xattr, + GF_XATTROP_INDEX_COUNT); + loc_wipe (&rootloc); + + if (ret < 0) + return -1; + + ret = dict_get_uint64 (xattr, GF_XATTROP_INDEX_COUNT, &count); + if (ret) + return -1; + return count; +} + + +int +afr_xl_op (xlator_t *this, dict_t *input, dict_t *output) +{ + gf_xl_afr_op_t op = GF_AFR_OP_INVALID; + int ret = 0; + int xl_id = 0; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + struct subvol_healer *healer = NULL; + int i = 0; + char key[64]; + int op_ret = 0; + int64_t cnt = 0; + + priv = this->private; + shd = &priv->shd; + + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == -1) + goto out; + + ret = dict_get_int32 (input, "xl-op", (int32_t*)&op); + if (ret) + goto out; + ret = dict_get_int32 (input, this->name, &xl_id); + if (ret) + goto out; + ret = dict_set_int32 (output, this->name, xl_id); + if (ret) + goto out; + switch (op) { + case GF_AFR_OP_HEAL_INDEX: + op_ret = -1; + + for (i = 0; i < priv->child_count; i++) { + healer = &shd->index_healers[i]; + snprintf (key, 64, "%d-%d-status", xl_id, i); + + if (!priv->child_up[i]) { + ret = dict_set_str (output, key, + "Brick is not connected"); + } else if (AFR_COUNT (priv->child_up, + priv->child_count) < 2) { + ret = dict_set_str (output, key, + "< 2 bricks in replica are up"); + } else if (!afr_shd_is_subvol_local (this, healer->subvol)) { + ret = dict_set_str (output, key, + "Brick is remote"); + } else { + ret = dict_set_str (output, key, + "Started self-heal"); + afr_shd_index_healer_spawn (this, i); + op_ret = 0; + } + } + break; + case GF_AFR_OP_HEAL_FULL: + op_ret = -1; + + for (i = 0; i < priv->child_count; i++) { + healer = &shd->full_healers[i]; + snprintf (key, 64, "%d-%d-status", xl_id, i); + + if (!priv->child_up[i]) { + ret = dict_set_str (output, key, + "Brick is not connected"); + } else if (AFR_COUNT (priv->child_up, + priv->child_count) < 2) { + ret = dict_set_str (output, key, + "< 2 bricks in replica are up"); + } else if (!afr_shd_is_subvol_local (this, healer->subvol)) { + ret = dict_set_str (output, key, + "Brick is remote"); + } else { + ret = dict_set_str (output, key, + "Started self-heal"); + afr_shd_full_healer_spawn (this, i); + op_ret = 0; + } + } + break; + case GF_AFR_OP_INDEX_SUMMARY: + for (i = 0; i < priv->child_count; i++) + if (shd->index_healers[i].local) + afr_shd_gather_index_entries (this, i, output); + break; + case GF_AFR_OP_HEALED_FILES: + eh_dump (shd->healed, output, afr_add_shd_event); + break; + case GF_AFR_OP_HEAL_FAILED_FILES: + eh_dump (shd->heal_failed, output, afr_add_shd_event); + break; + case GF_AFR_OP_SPLIT_BRAIN_FILES: + eh_dump (shd->split_brain, output, afr_add_shd_event); + break; + case GF_AFR_OP_STATISTICS: + for (i = 0; i < priv->child_count; i++) { + eh_dump (shd->statistics[i], output, + afr_add_crawl_event); + afr_shd_dict_add_crawl_event (this, output, + &shd->index_healers[i].crawl_event); + afr_shd_dict_add_crawl_event (this, output, + &shd->full_healers[i].crawl_event); + } + break; + case GF_AFR_OP_STATISTICS_HEAL_COUNT: + case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA: + op_ret = -1; + + for (i = 0; i < priv->child_count; i++) { + if (!priv->child_up[i]) { + snprintf (key, 64, "%d-%d-status", xl_id, i); + ret = dict_set_str (output, key, + "Brick is not connected"); + } else { + snprintf (key, 64, "%d-%d-hardlinks", xl_id, i); + cnt = afr_shd_get_index_count (this, i); + if (cnt >= 0) { + ret = dict_set_uint64 (output, key, cnt); + } + op_ret = 0; + } + } + +// ret = _do_crawl_op_on_local_subvols (this, INDEX_TO_BE_HEALED, +// STATISTICS_TO_BE_HEALED, +// output); + break; + + default: + gf_log (this->name, GF_LOG_ERROR, "Unknown set op %d", op); + break; + } +out: + dict_del (output, this->name); + return op_ret; +} diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h new file mode 100644 index 000000000..10e229ee7 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heald.h @@ -0,0 +1,72 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _AFR_SELF_HEALD_H +#define _AFR_SELF_HEALD_H + +#include <pthread.h> + + +typedef struct { + int child; + char *path; +} shd_event_t; + +typedef struct { + int child; + uint64_t healed_count; + uint64_t split_brain_count; + uint64_t heal_failed_count; + + /* If start_time is 0, it means crawler is not in progress + and stats are not valid */ + time_t start_time; + /* If start_time is NOT 0 and end_time is 0, it means + cralwer is in progress */ + time_t end_time; + char *crawl_type; +} crawl_event_t; + +struct subvol_healer { + xlator_t *this; + int subvol; + gf_boolean_t local; + gf_boolean_t running; + gf_boolean_t rerun; + crawl_event_t crawl_event; + pthread_mutex_t mutex; + pthread_cond_t cond; + pthread_t thread; +}; + +typedef struct { + gf_boolean_t iamshd; + gf_boolean_t enabled; + struct subvol_healer *index_healers; + struct subvol_healer *full_healers; + + eh_t *healed; + eh_t *heal_failed; + eh_t *split_brain; + eh_t **statistics; +} afr_self_heald_t; + + +int +afr_selfheal_childup (xlator_t *this, int subvol); + +int +afr_selfheal_daemon_init (xlator_t *this); + +int +afr_xl_op (xlator_t *this, dict_t *input, dict_t *output); + +#endif /* !_AFR_SELF_HEALD_H */ diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 828a6e9ab..205ff759e 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -1,1118 +1,1579 @@ /* - Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include "dict.h" #include "byte-order.h" +#include "common-utils.h" +#include "timer.h" #include "afr.h" #include "afr-transaction.h" #include <signal.h> +gf_boolean_t +afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this); -static void -afr_pid_save (call_frame_t *frame) -{ - afr_local_t * local = NULL; - - local = frame->local; +gf_boolean_t +afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this); - local->saved_pid = frame->root->pid; -} +int +afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr, + afr_changelog_resume_t changelog_resume); -static void -afr_pid_restore (call_frame_t *frame) +int +__afr_txn_write_fop (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; local = frame->local; + priv = this->private; - frame->root->pid = local->saved_pid; -} + call_count = AFR_COUNT (local->transaction.pre_op, priv->child_count); + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } -static void -__mark_all_pending (int32_t *pending[], int child_count, - afr_transaction_type type) -{ - int i; - int j; + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + local->transaction.wind (frame, this, i); - for (i = 0; i < child_count; i++) { - j = afr_index_for_transaction_type (type); - pending[i][j] = hton32 (1); + if (!--call_count) + break; + } } + + return 0; } -static void -__mark_child_dead (int32_t *pending[], int child_count, int child, - afr_transaction_type type) +int +__afr_txn_write_done (call_frame_t *frame, xlator_t *this) { - int j; + afr_local_t *local = NULL; - j = afr_index_for_transaction_type (type); - - pending[child][j] = 0; -} + local = frame->local; + local->transaction.unwind (frame, this); -static void -__mark_fop_failed_on_fd (fd_t *fd, xlator_t *this, - int child_index) -{ - uint64_t ctx; - afr_fd_ctx_t * fd_ctx = NULL; + AFR_STACK_DESTROY (frame); - int ret = 0; + return 0; +} - ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) - goto out; +call_frame_t* +afr_transaction_detach_fop_frame (call_frame_t *frame) +{ + afr_local_t * local = NULL; + call_frame_t *fop_frame = NULL; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + local = frame->local; - fd_ctx->child_failed[child_index] = 1; -out: - return; + LOCK (&frame->lock); + { + fop_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + return fop_frame; } static void -__mark_failed_children (int32_t *pending[], int child_count, - xlator_t *this, fd_t *fd, afr_transaction_type type) +afr_save_lk_owner (call_frame_t *frame) { - uint64_t ctx; - afr_fd_ctx_t * fd_ctx = NULL; + afr_local_t * local = NULL; - int ret = 0; - int i = 0; - int j = 0; + local = frame->local; - ret = fd_ctx_get (fd, this, &ctx); + local->saved_lk_owner = frame->root->lk_owner; +} - if (ret < 0) - goto out; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; +static void +afr_restore_lk_owner (call_frame_t *frame) +{ + afr_local_t * local = NULL; - for (i = 0; i < child_count; i++) { - j = afr_index_for_transaction_type (type); + local = frame->local; - if (fd_ctx->child_failed[i]) - pending[i][j] = 0; - } - -out: - return; + frame->root->lk_owner = local->saved_lk_owner; } - -static void -__mark_down_children (int32_t *pending[], int child_count, - unsigned char *child_up, afr_transaction_type type) +void +__mark_all_success (call_frame_t *frame, xlator_t *this) { + afr_private_t *priv = NULL; + afr_local_t *local = NULL; int i; - int j; - for (i = 0; i < child_count; i++) { - j = afr_index_for_transaction_type (type); + local = frame->local; + priv = this->private; - if (!child_up[i]) - pending[i][j] = 0; - } + for (i = 0; i < priv->child_count; i++) { + local->transaction.failed_subvols[i] = 0; + } } -static void -__mark_all_success (int32_t *pending[], int child_count, - afr_transaction_type type) +int +afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) { - int i; - int j; + afr_local_t *local = NULL; + fd_t *fd = NULL; - for (i = 0; i < child_count; i++) { - j = afr_index_for_transaction_type (type); - pending[i][j] = hton32 (-1); - } + local = frame->local; + fd = local->fd; + + /* Perform fops with the lk-owner from top xlator. + * Eg: lk-owner of posix-lk and flush should be same, + * flush cant clear the posix-lks without that lk-owner. + */ + afr_save_lk_owner (frame); + frame->root->lk_owner = + local->transaction.main_frame->root->lk_owner; + + if (local->pre_op_compat) + /* old mode, pre-op was done as afr_changelog_do() + just now, before OP */ + afr_changelog_pre_op_update (frame, this); + + /* The wake up needs to happen independent of + what type of fop arrives here. If it was + a write, then it has already inherited the + lock and changelog. If it was not a write, + then the presumption of the optimization (of + optimizing for successive write operations) + fails. + */ + if (fd) + afr_delayed_changelog_wake_up (this, fd); + local->transaction.fop (frame, this); + + return 0; } static int -__is_first_write_on_fd (xlator_t *this, fd_t *fd) +__changelog_enabled (afr_private_t *priv, afr_transaction_type type) { - int op_ret = 0; - int _ret = -1; + int ret = 0; - uint64_t ctx; - afr_fd_ctx_t * fd_ctx = NULL; + switch (type) { + case AFR_DATA_TRANSACTION: + if (priv->data_change_log) + ret = 1; - LOCK (&fd->lock); - { - _ret = __fd_ctx_get (fd, this, &ctx); - - if (_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "could not get fd ctx on fd=%p", - fd); - goto out; - } + break; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + case AFR_METADATA_TRANSACTION: + if (priv->metadata_change_log) + ret = 1; - if (fd_ctx->pre_op_done == 0) { - fd_ctx->pre_op_done = 1; - op_ret = 1; - } + break; + + case AFR_ENTRY_TRANSACTION: + case AFR_ENTRY_RENAME_TRANSACTION: + if (priv->entry_change_log) + ret = 1; + + break; } -out: - UNLOCK (&fd->lock); - return op_ret; + return ret; } static int -__if_fd_pre_op_done (xlator_t *this, fd_t *fd) +__fop_changelog_needed (call_frame_t *frame, xlator_t *this) { + afr_private_t * priv = NULL; + afr_local_t * local = NULL; int op_ret = 0; - int _ret = -1; + afr_transaction_type type = -1; - uint64_t ctx; - afr_fd_ctx_t * fd_ctx = NULL; + priv = this->private; + local = frame->local; + type = local->transaction.type; - LOCK (&fd->lock); - { - _ret = __fd_ctx_get (fd, this, &ctx); + if (__changelog_enabled (priv, type)) { + switch (local->op) { - if (_ret < 0) { - goto out; - } + case GF_FOP_WRITE: + case GF_FOP_FTRUNCATE: + op_ret = 1; + break; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + case GF_FOP_FLUSH: + op_ret = 0; + break; - if (fd_ctx->pre_op_done) { - fd_ctx->pre_op_done = 0; + default: op_ret = 1; } } -out: - UNLOCK (&fd->lock); return op_ret; } -static int -__changelog_enabled (afr_private_t *priv, afr_transaction_type type) +int +afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int **pending) { - int ret = 0; + int i = 0; + int ret = 0; + int pending_zero[AFR_NUM_CHANGE_LOGS] = {0, }; - switch (type) { - case AFR_DATA_TRANSACTION: - if (priv->data_change_log) - ret = 1; - - break; + for (i = 0; i < priv->child_count; i++) { + if (!memcmp (pending_zero, pending[i], sizeof (pending_zero))) + /* don't set xattrs for non-pending servers */ + continue; - case AFR_METADATA_TRANSACTION: - if (priv->metadata_change_log) - ret = 1; + ret = dict_set_static_bin (xattr, priv->pending_key[i], + pending[i], + AFR_NUM_CHANGE_LOGS * sizeof (int)); + /* 3 = data+metadata+entry */ - break; + if (ret) + break; + } - case AFR_ENTRY_TRANSACTION: - case AFR_ENTRY_RENAME_TRANSACTION: - if (priv->entry_change_log) - ret = 1; + return ret; +} - break; - - case AFR_FLUSH_TRANSACTION: - ret = 1; - } +int +afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) +{ + int ret = 0; - return ret; -} + switch (type) { + case AFR_DATA_TRANSACTION: + ret = priv->child_count; + break; + case AFR_METADATA_TRANSACTION: + ret = priv->child_count; + break; -static int -__changelog_needed_pre_op (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - fd_t * fd = NULL; + case AFR_ENTRY_TRANSACTION: + case AFR_ENTRY_RENAME_TRANSACTION: + ret = priv->child_count; + break; + } - int op_ret = 0; + return ret; +} - priv = this->private; - local = frame->local; - - if (__changelog_enabled (priv, local->transaction.type)) { - switch (local->op) { - - case GF_FOP_WRITE: - case GF_FOP_FTRUNCATE: - /* - if it's a data transaction, we write the changelog - only on the first write on an fd - */ - - fd = local->fd; - if (!fd || __is_first_write_on_fd (this, fd)) - op_ret = 1; +/* {{{ pending */ - break; - case GF_FOP_FLUSH: - /* only do post-op on flush() */ +int +afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; - op_ret = 0; - break; + local = frame->local; + priv = this->private; + int_lock = &local->internal_lock; - default: - op_ret = 1; - } + if (local->transaction.resume_stub) { + call_resume (local->transaction.resume_stub); + local->transaction.resume_stub = NULL; + } + + if (afr_lock_server_count (priv, local->transaction.type) == 0) { + local->transaction.done (frame, this); + } else { + int_lock->lock_cbk = local->transaction.done; + afr_unlock (frame, this); } - return op_ret; + return 0; } -static int -__changelog_needed_post_op (call_frame_t *frame, xlator_t *this) +afr_inodelk_t* +afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; + afr_inodelk_t *inodelk = NULL; + int i = 0; - int op_ret = 0; - afr_transaction_type type = -1; + for (i = 0; int_lock->inodelk[i].domain; i++) { + inodelk = &int_lock->inodelk[i]; + if (strcmp (dom, inodelk->domain) == 0) + return inodelk; + } + return NULL; +} - priv = this->private; - local = frame->local; - type = local->transaction.type; +unsigned char* +afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock) +{ + unsigned char *locked_nodes = NULL; + afr_inodelk_t *inodelk = NULL; + switch (type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + locked_nodes = inodelk->locked_nodes; + break; + + case AFR_ENTRY_TRANSACTION: + case AFR_ENTRY_RENAME_TRANSACTION: + /*Because same set of subvols participate in all lockee + * entities*/ + locked_nodes = int_lock->lockee[0].locked_nodes; + break; + } + return locked_nodes; +} - if (__changelog_enabled (priv, type)) { - switch (local->op) { - case GF_FOP_WRITE: - case GF_FOP_FTRUNCATE: - op_ret = 0; - break; +int +afr_changelog_call_count (afr_transaction_type type, + unsigned char *pre_op_subvols, + unsigned int child_count) +{ + int call_count = 0; - case GF_FOP_FLUSH: - op_ret = __if_fd_pre_op_done (this, local->fd); - break; + call_count = AFR_COUNT(pre_op_subvols, child_count); - default: - op_ret = 1; - } - } + if (type == AFR_ENTRY_RENAME_TRANSACTION) + call_count *= 2; - return op_ret; + return call_count; } -static int -afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending) +gf_boolean_t +afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this) { - int i; - int ret = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + + local = frame->local; + priv = this->private; for (i = 0; i < priv->child_count; i++) { - ret = dict_set_static_bin (xattr, priv->pending_key[i], - pending[i], 3 * sizeof (int32_t)); - /* 3 = data+metadata+entry */ - - if (ret < 0) - goto out; + if (local->transaction.failed_subvols[i]) + return _gf_false; } -out: - return ret; + return _gf_true; } -static int -afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) +void +afr_handle_symmetric_errors (call_frame_t *frame, xlator_t *this) { - int ret = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int op_errno = 0; + int i_errno = 0; + gf_boolean_t matching_errors = _gf_true; + int i = 0; - switch (type) { - case AFR_FLUSH_TRANSACTION: - case AFR_DATA_TRANSACTION: - ret = priv->data_lock_server_count; - break; + priv = this->private; + local = frame->local; - case AFR_METADATA_TRANSACTION: - ret = priv->metadata_lock_server_count; - break; + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret != -1) { + /* Operation succeeded on at least on subvol, + so it is not a failed-everywhere situation. + */ + matching_errors = _gf_false; + break; + } + i_errno = local->replies[i].op_errno; + + if (i_errno == ENOTCONN) { + /* ENOTCONN is not a symmetric error. We do not + know if the operation was performed on the + backend or not. + */ + matching_errors = _gf_false; + break; + } - case AFR_ENTRY_TRANSACTION: - case AFR_ENTRY_RENAME_TRANSACTION: - ret = priv->entry_lock_server_count; - break; + if (!op_errno) { + op_errno = i_errno; + } else if (op_errno != i_errno) { + /* Mismatching op_errno's */ + matching_errors = _gf_false; + break; + } } - return ret; + if (matching_errors) + __mark_all_success (frame, this); } -/* {{{ unlock */ - -int32_t -afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +int +afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) { - afr_local_t *local; - int call_count = 0; + afr_private_t * priv = this->private; + int i = 0; + int ret = 0; + int idx = 0; + afr_local_t * local = NULL; + dict_t *xattr = NULL; + int nothing_failed = 1; + gf_boolean_t need_undirty = _gf_false; - local = frame->local; + local = frame->local; + idx = afr_index_for_transaction_type (local->transaction.type); - LOCK (&frame->lock); - { - call_count = --local->call_count; + nothing_failed = afr_txn_nothing_failed (frame, this); + + if (afr_changelog_pre_op_uninherit (frame, this)) + need_undirty = _gf_false; + else + need_undirty = _gf_true; + + if (nothing_failed && !need_undirty) { + afr_changelog_post_op_done (frame, this); + goto out; } - UNLOCK (&frame->lock); - if (call_count == 0) { - local->transaction.done (frame, this); + xattr = dict_new (); + if (!xattr) { + local->op_ret = -1; + local->op_errno = ENOMEM; + afr_changelog_post_op_done (frame, this); + goto out; } - - return 0; -} + if (need_undirty) { + local->dirty[idx] = hton32(-1); -int -afr_unlock (call_frame_t *frame, xlator_t *this) -{ - struct flock flock; + ret = dict_set_static_bin (xattr, AFR_DIRTY, local->dirty, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + local->op_ret = -1; + local->op_errno = ENOMEM; + afr_changelog_post_op_done (frame, this); + goto out; + } - int i = 0; - int call_count = 0; + } + if (!nothing_failed) { + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.failed_subvols[i]) + local->pending[i][idx] = hton32(1); + } + ret = afr_set_pending_dict (priv, xattr, local->pending); + if (ret < 0) { + local->op_ret = -1; + local->op_errno = ENOMEM; + afr_changelog_post_op_done (frame, this); + goto out; + } + + } + + afr_changelog_do (frame, this, xattr, afr_changelog_post_op_done); +out: + if (xattr) + dict_unref (xattr); + + return 0; +} + + +gf_boolean_t +afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this) +{ afr_local_t *local = NULL; - afr_private_t * priv = this->private; + afr_private_t *priv = NULL; + fd_t *fd = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + afr_fd_ctx_t *fd_ctx = NULL; + int type = 0; local = frame->local; + priv = this->private; + fd = local->fd; - /* - pid has been restored to saved_pid in the fop, - so set it back to frame->root - */ + type = afr_index_for_transaction_type (local->transaction.type); + if (type != AFR_DATA_TRANSACTION) + return !local->transaction.dirtied; - frame->root->pid = (long) frame->root; + if (!fd) + return !local->transaction.dirtied; - call_count = afr_locked_nodes_count (local->transaction.locked_nodes, - priv->child_count); - - if (call_count == 0) { - local->transaction.done (frame, this); - return 0; - } + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + return _gf_false; - if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) - call_count *= 2; - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - flock.l_start = local->transaction.start; - flock.l_len = local->transaction.len; - flock.l_type = F_UNLCK; - - if (local->transaction.locked_nodes[i]) { - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - case AFR_FLUSH_TRANSACTION: - - if (local->fd) { - STACK_WIND (frame, afr_unlock_common_cbk, - priv->children[i], - priv->children[i]->fops->finodelk, - this->name, local->fd, - F_SETLK, &flock); - } else { - STACK_WIND (frame, afr_unlock_common_cbk, - priv->children[i], - priv->children[i]->fops->inodelk, - this->name, &local->loc, - F_SETLK, &flock); - } - - break; - - case AFR_ENTRY_RENAME_TRANSACTION: - - STACK_WIND (frame, afr_unlock_common_cbk, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, - &local->transaction.new_parent_loc, - local->transaction.new_basename, - ENTRYLK_UNLOCK, ENTRYLK_WRLCK); - - call_count--; - - /* fall through */ - - case AFR_ENTRY_TRANSACTION: - if (local->fd) { - STACK_WIND (frame, afr_unlock_common_cbk, - priv->children[i], - priv->children[i]->fops->fentrylk, - this->name, local->fd, - local->transaction.basename, - ENTRYLK_UNLOCK, ENTRYLK_WRLCK); - } else { - STACK_WIND (frame, afr_unlock_common_cbk, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, - &local->transaction.parent_loc, - local->transaction.basename, - ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + if (local->transaction.no_uninherit) + return _gf_false; - } - break; + /* This function must be idempotent. So check if we + were called before and return the same answer again. + + It is important to keep this function idempotent for + the call in afr_changelog_post_op_safe() to not have + side effects on the call from afr_changelog_post_op_now() + */ + if (local->transaction.uninherit_done) + return local->transaction.uninherit_value; + + LOCK(&fd->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] != + fd_ctx->pre_op_done[type][i]) { + ret = !local->transaction.dirtied; + goto unlock; } - - if (!--call_count) - break; } - } - return 0; -} + if (fd_ctx->inherited[type]) { + ret = _gf_true; + fd_ctx->inherited[type]--; + } else if (fd_ctx->on_disk[type]) { + ret = _gf_false; + fd_ctx->on_disk[type]--; + } else { + /* ASSERT */ + ret = _gf_false; + } -/* }}} */ + if (!fd_ctx->inherited[type] && !fd_ctx->on_disk[type]) { + for (i = 0; i < priv->child_count; i++) + fd_ctx->pre_op_done[type][i] = 0; + } + } +unlock: + UNLOCK(&fd->lock); + local->transaction.uninherit_done = _gf_true; + local->transaction.uninherit_value = ret; -/* {{{ pending */ + return ret; +} -int32_t -afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) + +gf_boolean_t +afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - - int call_count = -1; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + fd_t *fd = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + afr_fd_ctx_t *fd_ctx = NULL; + int type = 0; - priv = this->private; local = frame->local; + priv = this->private; + fd = local->fd; - LOCK (&frame->lock); + if (local->transaction.type != AFR_DATA_TRANSACTION) + return _gf_false; + + type = afr_index_for_transaction_type (local->transaction.type); + + if (!fd) + return _gf_false; + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + return _gf_false; + + LOCK(&fd->lock); { - call_count = --local->call_count; + if (!fd_ctx->on_disk[type]) { + /* nothing to inherit yet */ + ret = _gf_false; + goto unlock; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] != + fd_ctx->pre_op_done[type][i]) { + /* either inherit exactly, or don't */ + ret = _gf_false; + goto unlock; + } + } + + fd_ctx->inherited[type]++; + + ret = _gf_true; + + local->transaction.inherited = _gf_true; } - UNLOCK (&frame->lock); +unlock: + UNLOCK(&fd->lock); - if (call_count == 0) { - if (afr_lock_server_count (priv, local->transaction.type) == 0) { - local->transaction.done (frame, this); + return ret; +} + + +gf_boolean_t +afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + fd_t *fd = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + int type = 0; + + local = frame->local; + priv = this->private; + fd = local->fd; + + if (!fd) + return _gf_false; + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + return _gf_false; + + if (local->transaction.inherited) + /* was already inherited in afr_changelog_pre_op */ + return _gf_false; + + if (!local->transaction.dirtied) + return _gf_false; + + if (!afr_txn_nothing_failed (frame, this)) + return _gf_false; + + type = afr_index_for_transaction_type (local->transaction.type); + + ret = _gf_false; + + LOCK(&fd->lock); + { + if (!fd_ctx->on_disk[type]) { + for (i = 0; i < priv->child_count; i++) + fd_ctx->pre_op_done[type][i] = + local->transaction.pre_op[i]; } else { - afr_unlock (frame, this); + for (i = 0; i < priv->child_count; i++) + if (fd_ctx->pre_op_done[type][i] != + local->transaction.pre_op[i]) { + local->transaction.no_uninherit = 1; + goto unlock; + } } + fd_ctx->on_disk[type]++; + + ret = _gf_true; } +unlock: + UNLOCK(&fd->lock); - return 0; + return ret; } -int -afr_changelog_post_op (call_frame_t *frame, xlator_t *this) +int +afr_changelog_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { - afr_private_t * priv = this->private; + afr_local_t *local = NULL; + int call_count = -1; - int ret = 0; - int i = 0; - int call_count = 0; - - afr_local_t * local = NULL; - dict_t * xattr = dict_ref (get_new_dict ()); + local = frame->local; - local = frame->local; + if (op_ret == -1) + afr_transaction_fop_failed (frame, this, (long) cookie); - __mark_down_children (local->pending, priv->child_count, - local->child_up, local->transaction.type); - - if (local->op == GF_FOP_FLUSH) { - __mark_failed_children (local->pending, priv->child_count, - this, local->fd, - local->transaction.type); - } + call_count = afr_frame_return (frame); - call_count = afr_up_children_count (priv->child_count, local->child_up); + if (call_count == 0) + local->transaction.changelog_resume (frame, this); - if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { - call_count *= 2; - } + return 0; +} - local->call_count = call_count; + +int +afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr, + afr_changelog_resume_t changelog_resume) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_changelog_call_count (local->transaction.type, + local->transaction.pre_op, + priv->child_count); if (call_count == 0) { - /* no child is up */ - dict_unref (xattr); - afr_unlock (frame, this); + changelog_resume (frame, this); return 0; } - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - ret = afr_set_pending_dict (priv, xattr, - local->pending); - - if (ret < 0) - gf_log (this->name, GF_LOG_DEBUG, - "failed to set pending entry"); - - - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - case AFR_FLUSH_TRANSACTION: - { - if (local->fd) - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr); - else - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, xattr); - } - break; + local->call_count = call_count; + + local->transaction.changelog_resume = changelog_resume; - case AFR_ENTRY_RENAME_TRANSACTION: - { - STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk, + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i]) + continue; + + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + if (!local->fd) { + STACK_WIND_COOKIE (frame, afr_changelog_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->xattrop, - &local->transaction.new_parent_loc, - GF_XATTROP_ADD_ARRAY, xattr); - - call_count--; - } - - /* - set it again because previous stack_wind - might have already returned (think of case - where subvolume is posix) and would have - used the dict as placeholder for return - value - */ - - ret = afr_set_pending_dict (priv, xattr, - local->pending); - - if (ret < 0) - gf_log (this->name, GF_LOG_DEBUG, - "failed to set pending entry"); - - /* fall through */ - - case AFR_ENTRY_TRANSACTION: - { - if (local->fd) - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr); - else - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.parent_loc, - GF_XATTROP_ADD_ARRAY, xattr); - } + &local->loc, + GF_XATTROP_ADD_ARRAY, xattr, + NULL); + } else { + STACK_WIND_COOKIE (frame, afr_changelog_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr, + NULL); + } + break; + case AFR_ENTRY_RENAME_TRANSACTION: + + STACK_WIND_COOKIE (frame, afr_changelog_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.new_parent_loc, + GF_XATTROP_ADD_ARRAY, xattr, + NULL); + call_count--; + + /* fall through */ + + case AFR_ENTRY_TRANSACTION: + if (local->fd) + STACK_WIND_COOKIE (frame, afr_changelog_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr, + NULL); + else + STACK_WIND_COOKIE (frame, afr_changelog_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.parent_loc, + GF_XATTROP_ADD_ARRAY, xattr, + NULL); break; - } - - if (!--call_count) - break; } - } - - dict_unref (xattr); + + if (!--call_count) + break; + } + return 0; } -int32_t -afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) +int +afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = this->private; - loc_t * loc = NULL; + afr_private_t * priv = this->private; + int i = 0; + int ret = 0; + int call_count = 0; + int op_errno = 0; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + unsigned char *locked_nodes = NULL; + unsigned char *pending_subvols = NULL; + int idx = -1; + gf_boolean_t pre_nop = _gf_true; + dict_t *xdata_req = NULL; + + local = frame->local; + int_lock = &local->internal_lock; + idx = afr_index_for_transaction_type (local->transaction.type); - int call_count = -1; - int child_index = (long) cookie; + locked_nodes = afr_locked_nodes_get (local->transaction.type, int_lock); - local = frame->local; - loc = &local->loc; + pending_subvols = alloca0 (priv->child_count); - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->child_up[child_index] = 0; - - if (op_errno == ENOTSUP) { - gf_log (this->name, GF_LOG_ERROR, - "xattrop not supported by %s", - priv->children[child_index]->name); - local->op_ret = -1; - - } else if (!child_went_down (op_ret, op_errno)) { - gf_log (this->name, GF_LOG_ERROR, - "xattrop failed on child %s: %s", - priv->children[child_index]->name, - strerror (op_errno)); - } - local->op_errno = op_errno; + for (i = 0; i < priv->child_count; i++) { + if (locked_nodes[i]) { + local->transaction.pre_op[i] = 1; + call_count++; + } else { + pending_subvols[i] = 1; } + } + + /* TBD: quorum check w/ call_count */ - call_count = --local->call_count; + if (call_count == 0) { + op_errno = ENOTCONN; + goto err; } - UNLOCK (&frame->lock); - if (call_count == 0) { - if ((local->op_ret == -1) && - (local->op_errno == ENOTSUP)) { - local->transaction.resume (frame, this); - } else { - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); + xdata_req = dict_new(); + if (!xdata_req) { + op_errno = ENOMEM; + goto err; + } + + pre_nop = _gf_true; - afr_pid_restore (frame); + if (afr_changelog_pre_op_inherit (frame, this)) + goto next; - local->transaction.fop (frame, this); + if (call_count < priv->child_count) { + /* For subvols we are not performing operation on, + mark them as pending up-front along with the FOP + so that we can safely defer unmarking dirty until + later. + */ + for (i = 0; i < priv->child_count; i++) { + if (pending_subvols[i]) + local->pending[i][idx] = hton32(1); + } + ret = afr_set_pending_dict (priv, xdata_req, + local->pending); + if (ret < 0) { + op_errno = ENOMEM; + goto err; } + pre_nop = _gf_false; } - return 0; + if (call_count > 1 && + (local->transaction.type == AFR_DATA_TRANSACTION || + !local->optimistic_change_log)) { + + /* If we are performing change on only one subvol, no + need to mark dirty, because we are setting the pending + counts already anyways + */ + local->dirty[idx] = hton32(1); + + ret = dict_set_static_bin (xdata_req, AFR_DIRTY, local->dirty, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + op_errno = ENOMEM; + goto err; + } + + pre_nop = _gf_false; + local->transaction.dirtied = 1; + } + + if (pre_nop) + goto next; + + if (!local->pre_op_compat) { + dict_copy (xdata_req, local->xdata_req); + goto next; + } + + afr_changelog_do (frame, this, xdata_req, afr_transaction_perform_fop); + + if (xdata_req) + dict_unref (xdata_req); + + return 0; +next: + afr_transaction_perform_fop (frame, this); + + if (xdata_req) + dict_unref (xdata_req); + + return 0; +err: + local->internal_lock.lock_cbk = local->transaction.done; + local->op_ret = -1; + local->op_errno = op_errno; + + afr_unlock (frame, this); + + if (xdata_req) + dict_unref (xdata_req); + + return 0; } -int -afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) +int +afr_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) { - afr_private_t * priv = this->private; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; - int i = 0; - int ret = 0; - int call_count = 0; - dict_t *xattr = NULL; + local = frame->local; + int_lock = &local->internal_lock; - afr_local_t *local = NULL; + if (int_lock->lock_op_ret < 0) { + gf_log (this->name, GF_LOG_INFO, + "Blocking inodelks failed."); + local->transaction.done (frame, this); + } else { - local = frame->local; - xattr = get_new_dict (); - dict_ref (xattr); + gf_log (this->name, GF_LOG_DEBUG, + "Blocking inodelks done. Proceeding to FOP"); + afr_internal_lock_finish (frame, this); + } - call_count = afr_up_children_count (priv->child_count, - local->child_up); + return 0; +} - if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { - call_count *= 2; - } - if (call_count == 0) { - /* no child is up */ - dict_unref (xattr); - afr_unlock (frame, this); - return 0; - } +int +afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + + local = frame->local; + int_lock = &local->internal_lock; + + /* Initiate blocking locks if non-blocking has failed */ + if (int_lock->lock_op_ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "Non blocking inodelks failed. Proceeding to blocking"); + int_lock->lock_cbk = afr_post_blocking_inodelk_cbk; + afr_blocking_lock (frame, this); + } else { + + gf_log (this->name, GF_LOG_DEBUG, + "Non blocking inodelks done. Proceeding to FOP"); + afr_internal_lock_finish (frame, this); + } - local->call_count = call_count; + return 0; +} - __mark_all_pending (local->pending, priv->child_count, - local->transaction.type); - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - ret = afr_set_pending_dict (priv, xattr, - local->pending); - - if (ret < 0) - gf_log (this->name, GF_LOG_DEBUG, - "failed to set pending entry"); - - - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - case AFR_FLUSH_TRANSACTION: - { - if (local->fd) - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr); - else - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &(local->loc), - GF_XATTROP_ADD_ARRAY, xattr); - } - break; - - case AFR_ENTRY_RENAME_TRANSACTION: - { - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.new_parent_loc, - GF_XATTROP_ADD_ARRAY, xattr); +int +afr_post_blocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; - call_count--; - } + local = frame->local; + int_lock = &local->internal_lock; + if (int_lock->lock_op_ret < 0) { + gf_log (this->name, GF_LOG_INFO, + "Blocking entrylks failed."); + local->transaction.done (frame, this); + } else { - /* - set it again because previous stack_wind - might have already returned (think of case - where subvolume is posix) and would have - used the dict as placeholder for return - value - */ + gf_log (this->name, GF_LOG_DEBUG, + "Blocking entrylks done. Proceeding to FOP"); + afr_internal_lock_finish (frame, this); + } - ret = afr_set_pending_dict (priv, xattr, - local->pending); - - if (ret < 0) - gf_log (this->name, GF_LOG_DEBUG, - "failed to set pending entry"); - - /* fall through */ - - case AFR_ENTRY_TRANSACTION: - { - if (local->fd) - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr); - else - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.parent_loc, - GF_XATTROP_ADD_ARRAY, xattr); - } + return 0; +} - break; - } - if (!--call_count) - break; - } - } +int +afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; - dict_unref (xattr); - return 0; + local = frame->local; + int_lock = &local->internal_lock; + + /* Initiate blocking locks if non-blocking has failed */ + if (int_lock->lock_op_ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "Non blocking entrylks failed. Proceeding to blocking"); + int_lock->lock_cbk = afr_post_blocking_entrylk_cbk; + afr_blocking_lock (frame, this); + } else { + + gf_log (this->name, GF_LOG_DEBUG, + "Non blocking entrylks done. Proceeding to FOP"); + afr_internal_lock_finish (frame, this); + } + + return 0; +} + + +int +afr_post_blocking_rename_cbk (call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + + local = frame->local; + int_lock = &local->internal_lock; + + if (int_lock->lock_op_ret < 0) { + gf_log (this->name, GF_LOG_INFO, + "Blocking entrylks failed."); + local->transaction.done (frame, this); + } else { + + gf_log (this->name, GF_LOG_DEBUG, + "Blocking entrylks done. Proceeding to FOP"); + afr_internal_lock_finish (frame, this); + } + return 0; +} + + +int +afr_post_lower_unlock_cbk (call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + + local = frame->local; + int_lock = &local->internal_lock; + + GF_ASSERT (!int_lock->higher_locked); + + int_lock->lock_cbk = afr_post_blocking_rename_cbk; + afr_blocking_lock (frame, this); + + return 0; +} + + +int +afr_set_transaction_flock (afr_local_t *local) +{ + afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; + + int_lock = &local->internal_lock; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + + inodelk->flock.l_len = local->transaction.len; + inodelk->flock.l_start = local->transaction.start; + inodelk->flock.l_type = F_WRLCK; + + return 0; +} + +int +afr_lock_rec (call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + + local = frame->local; + int_lock = &local->internal_lock; + + int_lock->transaction_lk_type = AFR_TRANSACTION_LK; + int_lock->domain = this->name; + + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + afr_set_transaction_flock (local); + + int_lock->lock_cbk = afr_post_nonblocking_inodelk_cbk; + + afr_nonblocking_inodelk (frame, this); + break; + + case AFR_ENTRY_RENAME_TRANSACTION: + + int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk; + afr_nonblocking_entrylk (frame, this); + break; + + case AFR_ENTRY_TRANSACTION: + int_lock->lk_basename = local->transaction.basename; + if (&local->transaction.parent_loc) + int_lock->lk_loc = &local->transaction.parent_loc; + else + GF_ASSERT (local->fd); + + int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk; + afr_nonblocking_entrylk (frame, this); + break; + } + + return 0; } + +int +afr_lock (call_frame_t *frame, xlator_t *this) +{ + afr_set_lock_number (frame, this); + + return afr_lock_rec (frame, this); +} + + /* }}} */ -/* {{{ lock */ +int +afr_internal_lock_finish (call_frame_t *frame, xlator_t *this) +{ + if (__fop_changelog_needed (frame, this)) { + afr_changelog_pre_op (frame, this); + } else { + afr_transaction_perform_fop (frame, this); + } + + return 0; +} -static -int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index); -int32_t -afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +void +afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int done = 0; - int child_index = (long) cookie; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - int call_count = 0; + /* call this function from any of the related optimizations + which benefit from delaying post op are enabled, namely: - local = frame->local; - priv = this->private; + - changelog piggybacking + - eager locking + */ - LOCK (&frame->lock); - { - if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { - /* wait for the other lock to return */ - call_count = --local->call_count; - } + priv = this->private; + if (!priv) + return; - if (op_ret == -1) { - if (op_errno == ENOSYS) { - /* return ENOTSUP */ - gf_log (this->name, GF_LOG_ERROR, - "subvolume does not support locking. " - "please load features/posix-locks xlator on server"); - local->op_ret = op_ret; - done = 1; - } + if (!priv->post_op_delay_secs) + return; - local->child_up[child_index] = 0; - local->op_errno = op_errno; - } - } - UNLOCK (&frame->lock); - - if (call_count == 0) { - if ((local->op_ret == -1) && - (local->op_errno == ENOSYS)) { - afr_unlock (frame, this); - } else { - local->transaction.locked_nodes[child_index] = 1; - local->transaction.lock_count++; - afr_lock_rec (frame, this, child_index + 1); - } - } + local = frame->local; + if (!local->transaction.eager_lock_on) + return; - return 0; + if (!local) + return; + + if (!local->fd) + return; + + if (local->op == GF_FOP_WRITE) + local->delayed_post_op = _gf_true; } +gf_boolean_t +afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this) +{ + afr_fd_ctx_t *fd_ctx = NULL; + + if (!fd) { + /* If false is returned, it may keep on taking eager-lock + * which may lead to starvation, so return true to avoid that. + */ + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid fd"); + return _gf_true; + } + /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock + * is taken mount2 opened the same file, it won't be able to + * perform any data operations until mount1 releases eager-lock. + * To avoid such scenario do not enable eager-lock for this transaction + * if open-fd-count is > 1 + */ + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + return _gf_true; -static loc_t * -lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2) + if (fd_ctx->open_fd_count > 1) + return _gf_true; + + return _gf_false; +} + + +gf_boolean_t +is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this) { - int ret = 0; + afr_local_t *local = NULL; + gf_boolean_t res = _gf_false; - ret = strcmp (l1->path, l2->path); - - if (ret == 0) - ret = strcmp (b1, b2); + local = frame->local; + if (!local) + goto out; - if (ret <= 0) - return l1; - else - return l2; + if (!local->delayed_post_op) + goto out; + + //Mark pending changelog ASAP + if (!afr_txn_nothing_failed (frame, this)) + goto out; + + if (local->fd && afr_are_multiple_fds_opened (local->fd, this)) + goto out; + + res = _gf_true; +out: + return res; } -static -int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index) +void +afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, + call_stub_t *stub); + +void +afr_delayed_changelog_wake_up_cbk (void *data) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; + fd_t *fd = NULL; - struct flock flock; + fd = data; - loc_t * lower = NULL; - loc_t * higher = NULL; + afr_delayed_changelog_wake_up (THIS, fd); +} - const char *lower_name = NULL; - const char *higher_name = NULL; - local = frame->local; - priv = this->private; +/* SET operation */ +int +afr_fd_report_unstable_write (xlator_t *this, fd_t *fd) +{ + afr_fd_ctx_t *fdctx = NULL; - flock.l_start = local->transaction.start; - flock.l_len = local->transaction.len; - flock.l_type = F_WRLCK; + fdctx = afr_fd_ctx_get (fd, this); - /* skip over children that are down */ - while ((child_index < priv->child_count) - && !local->child_up[child_index]) - child_index++; + LOCK(&fd->lock); + { + fdctx->witnessed_unstable_write = _gf_true; + } + UNLOCK(&fd->lock); - if ((child_index == priv->child_count) && - local->transaction.lock_count == 0) { + return 0; +} - gf_log (this->name, GF_LOG_DEBUG, - "unable to lock on even one child"); +/* TEST and CLEAR operation */ +gf_boolean_t +afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd) +{ + afr_fd_ctx_t *fdctx = NULL; + gf_boolean_t witness = _gf_false; - local->op_ret = -1; - local->op_errno = EAGAIN; + fdctx = afr_fd_ctx_get (fd, this); + if (!fdctx) + return _gf_true; - local->transaction.done (frame, this); - - return 0; + LOCK(&fd->lock); + { + if (fdctx->witnessed_unstable_write) { + witness = _gf_true; + fdctx->witnessed_unstable_write = _gf_false; + } + } + UNLOCK (&fd->lock); + + return witness; +} + +int +afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + afr_private_t *priv = NULL; + int child_index = (long) cookie; + int call_count = -1; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + if (op_ret != 0) { + /* Failure of fsync() is as good as failure of previous + write(). So treat it like one. + */ + gf_log (this->name, GF_LOG_WARNING, + "fsync(%s) failed on subvolume %s. Transaction was %s", + uuid_utoa (local->fd->inode->gfid), + priv->children[child_index]->name, + gf_fop_list[local->op]); + + afr_transaction_fop_failed (frame, this, child_index); } - if ((child_index == priv->child_count) - || (local->transaction.lock_count == - afr_lock_server_count (priv, local->transaction.type))) { + call_count = afr_frame_return (frame); - /* we're done locking */ + if (call_count == 0) + afr_changelog_post_op_now (frame, this); - if (__changelog_needed_pre_op (frame, this)) { - afr_changelog_pre_op (frame, this); - } else { - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); + return 0; +} - afr_pid_restore (frame); - local->transaction.fop (frame, this); - } +int +afr_changelog_fsync (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + GF_UNUSED int ret = -1; - return 0; - } + local = frame->local; + priv = this->private; - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - case AFR_FLUSH_TRANSACTION: - - if (local->fd) { - STACK_WIND_COOKIE (frame, afr_lock_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->finodelk, - this->name, local->fd, - F_SETLKW, &flock); - - } else { - STACK_WIND_COOKIE (frame, afr_lock_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->inodelk, - this->name, &local->loc, - F_SETLKW, &flock); - } - - break; - - case AFR_ENTRY_RENAME_TRANSACTION: + call_count = AFR_COUNT (local->transaction.pre_op, priv->child_count); + + if (!call_count) { + /* will go straight to unlock */ + afr_changelog_post_op_now (frame, this); + return 0; + } + + local->call_count = call_count; + + xdata = dict_new(); + if (xdata) + ret = dict_set_int32 (xdata, "batch-fsync", 1); + + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i]) + continue; + + STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->fsync, local->fd, + 1, xdata); + if (!--call_count) + break; + } + + if (xdata) + dict_unref (xdata); + + return 0; +} + + +int +afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) { + afr_changelog_post_op_now (frame, this); + return 0; + } + + if (afr_changelog_pre_op_uninherit (frame, this) && + afr_txn_nothing_failed (frame, this)) { + /* just detected that this post-op is about to + be optimized away as a new write() has + already piggybacked on this frame's changelog. + */ + afr_changelog_post_op_now (frame, this); + return 0; + } + + /* Calling afr_changelog_post_op_now() now will result in + issuing ->[f]xattrop(). + + Performing a hard POST-OP (->[f]xattrop() FOP) is a more + responsible operation that what it might appear on the surface. + + The changelog of a file (in the xattr of the file on the server) + stores information (pending count) about the state of the file + on the OTHER server. This changelog is blindly trusted, and must + therefore be updated in such a way it remains trustworthy. This + implies that decrementing the pending count (essentially "clearing + the dirty flag") must be done STRICTLY after we are sure that the + operation on the other server has reached stable storage. + + While the backend filesystem on that server will eventually flush + it to stable storage, we (being in userspace) have no mechanism + to get notified when the write became "stable". + + This means we need take matter into our own hands and issue an + fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES, + and get an acknowledgement for it. And we need to wait for the + fsync() acknowledgement before initiating the hard POST-OP. + + However if the FD itself was opened in O_SYNC or O_DSYNC then + we are already guaranteed that the writes were made stable as + part of the FOP itself. The same holds true for NFS stable + writes which happen on an anonymous FD with O_DSYNC or O_SYNC + flag set in the writev() @flags param. For all other write types, + mark a flag in the fdctx whenever an unstable write is witnessed. + */ + + if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) { + afr_changelog_post_op_now (frame, this); + return 0; + } + + /* Check whether users want durability and perform fsync/post-op + * accordingly. + */ + if (priv->ensure_durability) { + /* Time to fsync() */ + afr_changelog_fsync (frame, this); + } else { + afr_changelog_post_op_now (frame, this); + } + + return 0; +} + + +void +afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, + call_stub_t *stub) +{ + afr_fd_ctx_t *fd_ctx = NULL; + call_frame_t *prev_frame = NULL; + struct timespec delta = {0, }; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + goto out; + + delta.tv_sec = priv->post_op_delay_secs; + delta.tv_nsec = 0; + + pthread_mutex_lock (&fd_ctx->delay_lock); { - local->call_count = 2; - - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - lower_name = (lower == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - higher = (lower == &local->transaction.parent_loc ? - &local->transaction.new_parent_loc : - &local->transaction.parent_loc); - - higher_name = (higher == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - - /* TODO: these locks should be blocking */ - - STACK_WIND_COOKIE (frame, afr_lock_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->entrylk, - this->name, lower, lower_name, - ENTRYLK_LOCK, ENTRYLK_WRLCK); - - STACK_WIND_COOKIE (frame, afr_lock_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->entrylk, - this->name, higher, higher_name, - ENTRYLK_LOCK, ENTRYLK_WRLCK); - - break; + prev_frame = fd_ctx->delay_frame; + fd_ctx->delay_frame = NULL; + if (fd_ctx->delay_timer) + gf_timer_call_cancel (this->ctx, fd_ctx->delay_timer); + fd_ctx->delay_timer = NULL; + if (!frame) + goto unlock; + fd_ctx->delay_timer = gf_timer_call_after (this->ctx, delta, + afr_delayed_changelog_wake_up_cbk, + fd); + fd_ctx->delay_frame = frame; } - - case AFR_ENTRY_TRANSACTION: - if (local->fd) { - STACK_WIND_COOKIE (frame, afr_lock_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->fentrylk, - this->name, local->fd, - local->transaction.basename, - ENTRYLK_LOCK, ENTRYLK_WRLCK); - } else { - STACK_WIND_COOKIE (frame, afr_lock_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->entrylk, - this->name, - &local->transaction.parent_loc, - local->transaction.basename, - ENTRYLK_LOCK, ENTRYLK_WRLCK); - } +unlock: + pthread_mutex_unlock (&fd_ctx->delay_lock); - break; +out: + if (prev_frame) { + local = prev_frame->local; + local->transaction.resume_stub = stub; + afr_changelog_post_op_now (prev_frame, this); + } else if (stub) { + call_resume (stub); } - - return 0; } -int32_t afr_lock (call_frame_t *frame, xlator_t *this) +void +afr_changelog_post_op (call_frame_t *frame, xlator_t *this) { - afr_pid_save (frame); + afr_local_t *local = NULL; - frame->root->pid = (long) frame->root; + local = frame->local; - return afr_lock_rec (frame, this, 0); + if (is_afr_delayed_changelog_post_op_needed (frame, this)) + afr_delayed_changelog_post_op (this, frame, local->fd, NULL); + else + afr_changelog_post_op_safe (frame, this); } -/* }}} */ -int32_t +/* Wake up the sleeping/delayed post-op, and also register + a stub to have it resumed after this transaction + completely finishes. + + The @stub gets saved in @local and gets resumed in + afr_local_cleanup() + */ +void +afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub) +{ + afr_delayed_changelog_post_op (this, NULL, fd, stub); +} + + +void +afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd) +{ + afr_delayed_changelog_post_op (this, NULL, fd, NULL); +} + + +int afr_transaction_resume (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; + afr_local_t *local = NULL; - local = frame->local; - priv = this->private; + local = frame->local; - if (__changelog_needed_post_op (frame, this)) { - afr_changelog_post_op (frame, this); - } else { - if (afr_lock_server_count (priv, local->transaction.type) == 0) { - local->transaction.done (frame, this); - } else { - afr_unlock (frame, this); - } - } + if (local->transaction.eager_lock_on) { + /* We don't need to retain "local" in the + fd list anymore, writes to all subvols + are finished by now */ + afr_remove_eager_lock_stub (local); + } - return 0; + afr_restore_lk_owner (frame); + + afr_handle_symmetric_errors (frame, this); + + if (!local->pre_op_compat) + /* new mode, pre-op was done along + with OP */ + afr_changelog_pre_op_update (frame, this); + + if (__fop_changelog_needed (frame, this)) { + afr_changelog_post_op (frame, this); + } else { + afr_changelog_post_op_done (frame, this); + } + + return 0; } @@ -1121,54 +1582,141 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this) */ void -afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index) +afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, + int child_index) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; + afr_local_t * local = NULL; - local = frame->local; - priv = this->private; + local = frame->local; - switch (local->op) { - case GF_FOP_WRITE: - __mark_fop_failed_on_fd (local->fd, this, child_index); - break; - default: - __mark_child_dead (local->pending, priv->child_count, - child_index, local->transaction.type); - break; + local->transaction.failed_subvols[child_index] = 1; +} + + + + static gf_boolean_t +afr_locals_overlap (afr_local_t *local1, afr_local_t *local2) +{ + uint64_t start1 = local1->transaction.start; + uint64_t start2 = local2->transaction.start; + uint64_t end1 = 0; + uint64_t end2 = 0; + + if (local1->transaction.len) + end1 = start1 + local1->transaction.len - 1; + else + end1 = ULLONG_MAX; + + if (local2->transaction.len) + end2 = start2 + local2->transaction.len - 1; + else + end2 = ULLONG_MAX; + + return ((end1 >= start2) && (end2 >= start1)); +} + +void +afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_fd_ctx_t *fdctx = NULL; + afr_local_t *each = NULL; + + priv = this->private; + + if (!local->fd) + return; + + if (local->transaction.type != AFR_DATA_TRANSACTION) + return; + + if (!priv->eager_lock) + return; + + fdctx = afr_fd_ctx_get (local->fd, this); + if (!fdctx) + return; + + if (afr_are_multiple_fds_opened (local->fd, this)) + return; + /* + * Once full file lock is acquired in eager-lock phase, overlapping + * writes do not compete for inode-locks, instead are transferred to the + * next writes. Because of this overlapping writes are not ordered. + * This can cause inconsistencies in replication. + * Example: + * Two overlapping writes w1, w2 are sent in parallel on same fd + * in two threads t1, t2. + * Both threads can execute afr_writev_wind in the following manner. + * t1 winds w1 on brick-0 + * t2 winds w2 on brick-0 + * t2 winds w2 on brick-1 + * t1 winds w1 on brick-1 + * + * This check makes sure the locks are not transferred for + * overlapping writes. + */ + LOCK (&local->fd->lock); + { + list_for_each_entry (each, &fdctx->eager_locked, + transaction.eager_locked) { + if (afr_locals_overlap (each, local)) { + local->transaction.eager_lock_on = _gf_false; + goto unlock; + } + } + + local->transaction.eager_lock_on = _gf_true; + list_add_tail (&local->transaction.eager_locked, + &fdctx->eager_locked); } +unlock: + UNLOCK (&local->fd->lock); } -int32_t +int afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + fd_t *fd = NULL; + int ret = -1; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - afr_transaction_local_init (local, priv); + local->transaction.resume = afr_transaction_resume; + local->transaction.type = type; - local->transaction.resume = afr_transaction_resume; - local->transaction.type = type; + ret = afr_transaction_local_init (local, this); + if (ret < 0) + goto out; - if (afr_lock_server_count (priv, local->transaction.type) == 0) { - if (__changelog_needed_pre_op (frame, this)) { - afr_changelog_pre_op (frame, this); - } else { - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); + afr_transaction_eager_lock_init (local, this); - afr_pid_restore (frame); + if (local->fd && local->transaction.eager_lock_on) + afr_set_lk_owner (frame, this, local->fd); + else + afr_set_lk_owner (frame, this, frame->root); - local->transaction.fop (frame, this); - } - } else { - afr_lock (frame, this); - } + if (!local->transaction.eager_lock_on && local->loc.inode) { + fd = fd_lookup (local->loc.inode, frame->root->pid); + if (fd == NULL) + fd = fd_lookup_anonymous (local->loc.inode); - return 0; + if (fd) { + afr_delayed_changelog_wake_up (this, fd); + fd_unref (fd); + } + } + + if (afr_lock_server_count (priv, local->transaction.type) == 0) { + afr_internal_lock_finish (frame, this); + } else { + afr_lock (frame, this); + } + ret = 0; +out: + return ret; } diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index c7a6490e7..77cc8eed0 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -1,30 +1,53 @@ /* - Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __TRANSACTION_H__ #define __TRANSACTION_H__ +#include "afr.h" + void afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index); +int +afr_lock_server_count (afr_private_t *priv, afr_transaction_type type); + +afr_inodelk_t* +afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom); + int32_t afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type); +int +afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending); + +void +afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this); + +void +afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd); + +void +__mark_all_success (call_frame_t *frame, xlator_t *this); + +gf_boolean_t +afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this); + +int afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_read_txn_wind_t readfn, afr_transaction_type type); + +int afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol); + +int __afr_txn_write_fop (call_frame_t *frame, xlator_t *this); +int __afr_txn_write_done (call_frame_t *frame, xlator_t *this); +call_frame_t *afr_transaction_detach_fop_frame (call_frame_t *frame); + #endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 816656122..ead08425f 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include <libgen.h> @@ -28,2579 +19,731 @@ #define _CONFIG_H #include "config.h" #endif +#include "afr-common.c" -#include "glusterfs.h" -#include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" - -#include "fd.h" - -#include "afr-inode-read.h" -#include "afr-inode-write.h" -#include "afr-dir-read.h" -#include "afr-dir-write.h" -#include "afr-transaction.h" - -#include "afr-self-heal.h" - - -uint64_t -afr_is_split_brain (xlator_t *this, inode_t *inode) -{ - int ret = 0; - - uint64_t ctx = 0; - uint64_t split_brain = 0; - - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); - - if (ret < 0) - goto unlock; - - split_brain = ctx & 0xFFFFFFFF00000000ULL; - } -unlock: - UNLOCK (&inode->lock); - - return split_brain; -} - +struct volume_options options[]; -void -afr_set_split_brain (xlator_t *this, inode_t *inode, int32_t split_brain) +int32_t +notify (xlator_t *this, int32_t event, + void *data, ...) { - uint64_t ctx = 0; - int ret = 0; + int ret = -1; + va_list ap; + void *data2 = NULL; - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); + va_start (ap, data); + data2 = va_arg (ap, dict_t*); + va_end (ap); + ret = afr_notify (this, event, data, data2); - if (ret < 0) { - ctx = 0; - } + return ret; +} - ctx = (0x00000000FFFFFFFFULL & ctx) - | (split_brain & 0xFFFFFFFF00000000ULL); +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; - __inode_ctx_put (inode, this, ctx); - } - UNLOCK (&inode->lock); -} + if (!this) + return ret; + ret = xlator_mem_acct_init (this, gf_afr_mt_end + 1); -uint64_t -afr_read_child (xlator_t *this, inode_t *inode) -{ - int ret = 0; - - uint64_t ctx = 0; - uint64_t read_child = 0; - - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); - - if (ret < 0) - goto unlock; - - read_child = ctx & 0x00000000FFFFFFFFULL; + if (ret != 0) { + gf_log(this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); + return ret; } -unlock: - UNLOCK (&inode->lock); - return read_child; + return ret; } -void -afr_set_read_child (xlator_t *this, inode_t *inode, int32_t read_child) +int +xlator_subvolume_index (xlator_t *this, xlator_t *subvol) { - uint64_t ctx = 0; - int ret = 0; + int index = -1; + int i = 0; + xlator_list_t *list = NULL; - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); + list = this->children; - if (ret < 0) { - ctx = 0; + while (list) { + if (subvol == list->xlator || + strcmp (subvol->name, list->xlator->name) == 0) { + index = i; + break; } - - ctx = (0xFFFFFFFF00000000ULL & ctx) - | (0x00000000FFFFFFFFULL & read_child); - - __inode_ctx_put (inode, this, ctx); + list = list->next; + i++; } - UNLOCK (&inode->lock); -} - -/** - * afr_local_cleanup - cleanup everything in frame->local - */ - -void -afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) -{ - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - - - sh = &local->self_heal; - priv = this->private; - - if (sh->buf) - FREE (sh->buf); - - if (sh->xattr) { - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - dict_unref (sh->xattr[i]); - sh->xattr[i] = NULL; - } - } - FREE (sh->xattr); - } - - if (sh->child_errno) - FREE (sh->child_errno); - - if (sh->pending_matrix) { - for (i = 0; i < priv->child_count; i++) { - FREE (sh->pending_matrix[i]); - } - FREE (sh->pending_matrix); - } - - if (sh->delta_matrix) { - for (i = 0; i < priv->child_count; i++) { - FREE (sh->delta_matrix[i]); - } - FREE (sh->delta_matrix); - } - - if (sh->sources) - FREE (sh->sources); - - if (sh->success) - FREE (sh->success); - - if (sh->healing_fd) { - fd_unref (sh->healing_fd); - sh->healing_fd = NULL; - } - - loc_wipe (&sh->parent_loc); + return index; } - -void -afr_local_cleanup (afr_local_t *local, xlator_t *this) +void +fix_quorum_options (xlator_t *this, afr_private_t *priv, char *qtype) { - int i; - afr_private_t * priv = NULL; - - if (!local) - return; - - afr_local_sh_cleanup (local, this); - - FREE (local->child_errno); - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (local->pending && local->pending[i]) - FREE (local->pending[i]); + if (priv->quorum_count && strcmp(qtype,"fixed")) { + gf_log(this->name,GF_LOG_WARNING, + "quorum-type %s overriding quorum-count %u", + qtype, priv->quorum_count); + } + if (!strcmp(qtype,"none")) { + priv->quorum_count = 0; + } + else if (!strcmp(qtype,"auto")) { + priv->quorum_count = AFR_QUORUM_AUTO; } - - FREE (local->pending); - - loc_wipe (&local->loc); - loc_wipe (&local->newloc); - - FREE (local->transaction.locked_nodes); - FREE (local->transaction.child_errno); - - FREE (local->transaction.basename); - FREE (local->transaction.new_basename); - - loc_wipe (&local->transaction.parent_loc); - loc_wipe (&local->transaction.new_parent_loc); - - if (local->fd) - fd_unref (local->fd); - - if (local->xattr_req) - dict_unref (local->xattr_req); - - FREE (local->child_up); - - { /* lookup */ - if (local->cont.lookup.xattr) - dict_unref (local->cont.lookup.xattr); - } - - { /* getxattr */ - if (local->cont.getxattr.name) - FREE (local->cont.getxattr.name); - } - - { /* lk */ - if (local->cont.lk.locked_nodes) - FREE (local->cont.lk.locked_nodes); - } - - { /* checksum */ - if (local->cont.checksum.file_checksum) - FREE (local->cont.checksum.file_checksum); - if (local->cont.checksum.dir_checksum) - FREE (local->cont.checksum.dir_checksum); - } - - { /* create */ - if (local->cont.create.fd) - fd_unref (local->cont.create.fd); - } - - { /* writev */ - FREE (local->cont.writev.vector); - } - - { /* setxattr */ - if (local->cont.setxattr.dict) - dict_unref (local->cont.setxattr.dict); - } - - { /* removexattr */ - FREE (local->cont.removexattr.name); - } - - { /* symlink */ - FREE (local->cont.symlink.linkpath); - } } - int -afr_frame_return (call_frame_t *frame) +reconfigure (xlator_t *this, dict_t *options) { - afr_local_t *local = NULL; - int call_count = 0; + afr_private_t *priv = NULL; + xlator_t *read_subvol = NULL; + int read_subvol_index = -1; + int ret = -1; + int index = -1; + char *qtype = NULL; - local = frame->local; + priv = this->private; - LOCK (&frame->lock); - { - call_count = --local->call_count; - } - UNLOCK (&frame->lock); + GF_OPTION_RECONF ("afr-dirty-xattr", + priv->afr_dirty, options, str, + out); - return call_count; -} + GF_OPTION_RECONF ("metadata-splitbrain-forced-heal", + priv->metadata_splitbrain_forced_heal, options, bool, + out); -/** - * first_up_child - return the index of the first child that is up - */ + GF_OPTION_RECONF ("background-self-heal-count", + priv->background_self_heal_count, options, uint32, + out); -int -afr_first_up_child (afr_private_t *priv) -{ - xlator_t ** children = NULL; - int ret = -1; - int i = 0; - - LOCK (&priv->lock); - { - children = priv->children; - for (i = 0; i < priv->child_count; i++) { - if (priv->child_up[i]) { - ret = i; - break; - } - } - } - UNLOCK (&priv->lock); + GF_OPTION_RECONF ("metadata-self-heal", + priv->metadata_self_heal, options, bool, out); - return ret; -} + GF_OPTION_RECONF ("data-self-heal", priv->data_self_heal, options, str, + out); + GF_OPTION_RECONF ("entry-self-heal", priv->entry_self_heal, options, + bool, out); -/** - * up_children_count - return the number of children that are up - */ + GF_OPTION_RECONF ("data-self-heal-window-size", + priv->data_self_heal_window_size, options, + uint32, out); -int -afr_up_children_count (int child_count, unsigned char *child_up) -{ - int i = 0; - int ret = 0; + GF_OPTION_RECONF ("data-change-log", priv->data_change_log, options, + bool, out); - for (i = 0; i < child_count; i++) - if (child_up[i]) - ret++; - return ret; -} + GF_OPTION_RECONF ("metadata-change-log", + priv->metadata_change_log, options, bool, out); + GF_OPTION_RECONF ("entry-change-log", priv->entry_change_log, options, + bool, out); -int -afr_locked_nodes_count (unsigned char *locked_nodes, int child_count) -{ - int ret = 0; - int i; - - for (i = 0; i < child_count; i++) - if (locked_nodes[i]) - ret++; + GF_OPTION_RECONF ("data-self-heal-algorithm", + priv->data_self_heal_algorithm, options, str, out); - return ret; -} + GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out); + GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode, + options, uint32, out); -ino64_t -afr_itransform (ino64_t ino, int child_count, int child_index) -{ - ino64_t scaled_ino = -1; - - if (ino == ((uint64_t) -1)) { - scaled_ino = ((uint64_t) -1); - goto out; - } - - scaled_ino = (ino * child_count) + child_index; - -out: - return scaled_ino; -} - + if (read_subvol) { + index = xlator_subvolume_index (this, read_subvol); + if (index == -1) { + gf_log (this->name, GF_LOG_ERROR, "%s not a subvolume", + read_subvol->name); + goto out; + } + priv->read_child = index; + } -int -afr_deitransform_orig (ino64_t ino, int child_count) -{ - int index = -1; + GF_OPTION_RECONF ("read-subvolume-index",read_subvol_index, options,int32,out); - index = ino % child_count; + if (read_subvol_index >-1) { + index=read_subvol_index; + if (index >= priv->child_count) { + gf_log (this->name, GF_LOG_ERROR, "%d not a subvolume-index", + index); + goto out; + } + priv->read_child = index; + } - return index; -} + GF_OPTION_RECONF ("pre-op-compat", priv->pre_op_compat, options, bool, out); + GF_OPTION_RECONF ("eager-lock", priv->eager_lock, options, bool, out); + GF_OPTION_RECONF ("quorum-type", qtype, options, str, out); + GF_OPTION_RECONF ("quorum-count", priv->quorum_count, options, + uint32, out); + fix_quorum_options(this,priv,qtype); -int -afr_deitransform (ino64_t ino, int child_count) -{ - return 0; -} + GF_OPTION_RECONF ("post-op-delay-secs", priv->post_op_delay_secs, options, + uint32, out); + GF_OPTION_RECONF (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, + options, size_uint64, out); + /* Reset this so we re-discover in case the topology changed. */ + GF_OPTION_RECONF ("ensure-durability", priv->ensure_durability, options, + bool, out); -int -afr_self_heal_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; + GF_OPTION_RECONF ("self-heal-daemon", priv->shd.enabled, options, + bool, out); - local = frame->local; + GF_OPTION_RECONF ("iam-self-heal-daemon", priv->shd.iamshd, options, + bool, out); - if (local->govinda_gOvinda) { - afr_set_split_brain (this, local->cont.lookup.inode, 1); - } else { - afr_set_split_brain (this, local->cont.lookup.inode, 0); - } + priv->did_discovery = _gf_false; - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->cont.lookup.inode, - &local->cont.lookup.buf, - local->cont.lookup.xattr); + ret = 0; +out: + return ret; - return 0; } -int -afr_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - inode_t *inode, struct stat *buf, dict_t *xattr) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - struct stat * lookup_buf = NULL; - - int call_count = -1; - int child_index = -1; - int first_up_child = -1; - - uint32_t open_fd_count = 0; - int ret = 0; - - child_index = (long) cookie; - priv = this->private; - - LOCK (&frame->lock); - { - local = frame->local; - - lookup_buf = &local->cont.lookup.buf; - - if (op_ret == -1) { - if (op_errno == ENOENT) - local->enoent_count++; - - if (op_errno != ENOTCONN) { - if (local->op_errno != ESTALE) - local->op_errno = op_errno; - } - - if (op_errno == ESTALE) { - /* no matter what other subvolumes return for - * this call, ESTALE _must_ be sent to parent - */ - local->op_ret = -1; - local->op_errno = ESTALE; - } - goto unlock; - } - - if (afr_sh_has_metadata_pending (xattr, child_index, this)) - local->need_metadata_self_heal = 1; - - if (afr_sh_has_entry_pending (xattr, child_index, this)) - local->need_entry_self_heal = 1; - - if (afr_sh_has_data_pending (xattr, child_index, this)) - local->need_data_self_heal = 1; - - ret = dict_get_uint32 (xattr, GLUSTERFS_OPEN_FD_COUNT, - &open_fd_count); - local->open_fd_count += open_fd_count; - - /* in case of revalidate, we need to send stat of the - * child whose stat was sent during the first lookup. - * (so that time stamp does not vary with revalidate. - * in case it is down, stat of the fist success will - * be replied */ - - /* inode number should be preserved across revalidates */ - - if (local->success_count == 0) { - if (local->op_errno != ESTALE) - local->op_ret = op_ret; - - local->cont.lookup.inode = inode; - local->cont.lookup.xattr = dict_ref (xattr); - - *lookup_buf = *buf; - - lookup_buf->st_ino = afr_itransform (buf->st_ino, - priv->child_count, - child_index); - - if (priv->read_child >= 0) { - afr_set_read_child (this, - local->cont.lookup.inode, - priv->read_child); - } else { - afr_set_read_child (this, - local->cont.lookup.inode, - child_index); - } - - } else { - if (FILETYPE_DIFFERS (buf, lookup_buf)) { - /* mismatching filetypes with same name - -- Govinda !! GOvinda !!! - */ - local->govinda_gOvinda = 1; - } - - if (PERMISSION_DIFFERS (buf, lookup_buf)) { - /* mismatching permissions */ - local->need_metadata_self_heal = 1; - } - - if (OWNERSHIP_DIFFERS (buf, lookup_buf)) { - /* mismatching permissions */ - local->need_metadata_self_heal = 1; - } - - if (SIZE_DIFFERS (buf, lookup_buf) - && S_ISREG (buf->st_mode)) { - local->need_data_self_heal = 1; - } - - first_up_child = afr_first_up_child (priv); - - if (child_index == first_up_child) { - local->cont.lookup.buf.st_ino = - afr_itransform (buf->st_ino, - priv->child_count, - first_up_child); - } - - if (child_index == local->read_child_index) { - - /* - lookup has succeeded on the read child. - So use its inode number - */ - if (local->op_errno != ESTALE) - local->op_ret = op_ret; - - if (local->cont.lookup.xattr) - dict_unref (local->cont.lookup.xattr); - - local->cont.lookup.inode = inode; - local->cont.lookup.xattr = dict_ref (xattr); - - *lookup_buf = *buf; - - if (priv->read_child >= 0) { - afr_set_read_child (this, - local->cont.lookup.inode, - priv->read_child); - } else { - afr_set_read_child (this, - local->cont.lookup.inode, - local->read_child_index); - } - } - - } - - local->success_count++; - } -unlock: - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (local->op_ret == 0) { - /* KLUDGE: assuming DHT will not itransform in - revalidate */ - if (local->cont.lookup.inode->ino) - lookup_buf->st_ino = - local->cont.lookup.inode->ino; - } - - if (local->success_count && local->enoent_count) { - local->need_metadata_self_heal = 1; - local->need_data_self_heal = 1; - local->need_entry_self_heal = 1; - } - - if (local->success_count) { - /* check for split-brain case in previous lookup */ - if (afr_is_split_brain (this, - local->cont.lookup.inode)) - local->need_data_self_heal = 1; - } - - if ((local->need_metadata_self_heal - || local->need_data_self_heal - || local->need_entry_self_heal) - && (!local->open_fd_count)) { - - if (!local->cont.lookup.inode->st_mode) { - /* fix for RT #602 */ - local->cont.lookup.inode->st_mode = - lookup_buf->st_mode; - } - - afr_self_heal (frame, this, afr_self_heal_cbk); - } else { - AFR_STACK_UNWIND (frame, local->op_ret, - local->op_errno, - local->cont.lookup.inode, - &local->cont.lookup.buf, - local->cont.lookup.xattr); - } - } - - return 0; -} +static const char *favorite_child_warning_str = "You have specified subvolume '%s' " + "as the 'favorite child'. This means that if a discrepancy in the content " + "or attributes (ownership, permission, etc.) of a file is detected among " + "the subvolumes, the file on '%s' will be considered the definitive " + "version and its contents will OVERWRITE the contents of the file on other " + "subvolumes. All versions of the file except that on '%s' " + "WILL BE LOST."; -int -afr_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) +int32_t +init (xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int ret = -1; - int i = 0; - - uint64_t ctx; - - int32_t op_errno = 0; - - priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - local->op_ret = -1; - - frame->local = local; - - loc_copy (&local->loc, loc); - - ret = inode_ctx_get (loc->inode, this, &ctx); - if (ret == 0) { - /* lookup is a revalidate */ - - local->read_child_index = afr_read_child (this, loc->inode); - } else { - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); + afr_private_t *priv = NULL; + int child_count = 0; + xlator_list_t *trav = NULL; + int i = 0; + int ret = -1; + GF_UNUSED int op_errno = 0; + xlator_t *read_subvol = NULL; + int read_subvol_index = -1; + xlator_t *fav_child = NULL; + char *qtype = NULL; + + if (!this->children) { + gf_log (this->name, GF_LOG_ERROR, + "replicate translator needs more than one " + "subvolume defined."); + return -1; } - local->call_count = priv->child_count; - - local->child_up = memdup (priv->child_up, priv->child_count); - local->child_count = afr_up_children_count (priv->child_count, - local->child_up); - - /* By default assume ENOTCONN. On success it will be set to 0. */ - local->op_errno = ENOTCONN; - - if (xattr_req == NULL) - local->xattr_req = dict_new (); - else - local->xattr_req = dict_ref (xattr_req); - - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_uint64 (local->xattr_req, priv->pending_key[i], - 3 * sizeof(int32_t)); - - /* 3 = data+metadata+entry */ + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "Volume is dangling."); } - ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_OPEN_FD_COUNT, 0); - - for (i = 0; i < priv->child_count; i++) { - STACK_WIND_COOKIE (frame, afr_lookup_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - loc, local->xattr_req); - } - - ret = 0; -out: - if (ret == -1) - AFR_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL); - - return 0; -} - + this->private = GF_CALLOC (1, sizeof (afr_private_t), + gf_afr_mt_afr_private_t); + if (!this->private) + goto out; -/* {{{ open */ + priv = this->private; + LOCK_INIT (&priv->lock); -int -afr_fd_ctx_set (xlator_t *this, fd_t *fd) -{ - afr_private_t * priv = NULL; + child_count = xlator_subvolume_count (this); - int op_ret = 0; - int ret = 0; + priv->child_count = child_count; - uint64_t ctx; - afr_fd_ctx_t * fd_ctx = NULL; + priv->read_child = -1; - priv = this->private; + GF_OPTION_INIT ("afr-dirty-xattr", priv->afr_dirty, str, out); - LOCK (&fd->lock); - { - ret = __fd_ctx_get (fd, this, &ctx); - - if (ret == 0) - goto out; + GF_OPTION_INIT ("metadata-splitbrain-forced-heal", + priv->metadata_splitbrain_forced_heal, bool, out); - fd_ctx = CALLOC (1, sizeof (afr_fd_ctx_t)); - if (!fd_ctx) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - - op_ret = -ENOMEM; + GF_OPTION_INIT ("read-subvolume", read_subvol, xlator, out); + if (read_subvol) { + priv->read_child = xlator_subvolume_index (this, read_subvol); + if (priv->read_child == -1) { + gf_log (this->name, GF_LOG_ERROR, "%s not a subvolume", + read_subvol->name); goto out; } - - fd_ctx->child_failed = CALLOC (sizeof (*fd_ctx->child_failed), - priv->child_count); - - if (!fd_ctx->child_failed) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - - op_ret = -ENOMEM; + } + GF_OPTION_INIT ("read-subvolume-index",read_subvol_index,int32,out); + if (read_subvol_index > -1) { + if (read_subvol_index >= priv->child_count) { + gf_log (this->name, GF_LOG_ERROR, "%d not a subvolume-index", + read_subvol_index); goto out; } - - ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); - if (ret < 0) { - op_ret = ret; - } + priv->read_child = read_subvol_index; } -out: - UNLOCK (&fd->lock); - - return ret; -} - - -int -afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) -{ - afr_local_t * local = frame->local; - int ret = 0; + GF_OPTION_INIT ("choose-local", priv->choose_local, bool, out); - ret = afr_fd_ctx_set (this, local->fd); + GF_OPTION_INIT ("read-hash-mode", priv->hash_mode, uint32, out); - if (ret < 0) { - local->op_ret = -1; - local->op_errno = -ret; + priv->favorite_child = -1; + GF_OPTION_INIT ("favorite-child", fav_child, xlator, out); + if (fav_child) { + priv->favorite_child = xlator_subvolume_index (this, fav_child); + if (priv->favorite_child == -1) { + gf_log (this->name, GF_LOG_ERROR, "%s not a subvolume", + fav_child->name); + goto out; + } + gf_log (this->name, GF_LOG_WARNING, + favorite_child_warning_str, fav_child->name, + fav_child->name, fav_child->name); } - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->fd); - return 0; -} - - -int -afr_open_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - fd_t *fd) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int ret = 0; - - int call_count = -1; - - priv = this->private; - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - } - - if (op_ret >= 0) { - local->op_ret = op_ret; - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if ((local->cont.open.flags & O_TRUNC) - && (local->op_ret >= 0)) { - STACK_WIND (frame, afr_open_ftruncate_cbk, - this, this->fops->ftruncate, - fd, 0); - } else { - ret = afr_fd_ctx_set (this, fd); - - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "could not set fd ctx for fd=%p", - fd); - - local->op_ret = -1; - local->op_errno = -ret; - } - - AFR_STACK_UNWIND (frame, local->op_ret, - local->op_errno, local->fd); - } - } - - return 0; -} - - -int -afr_open (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, fd_t *fd) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - - int i = 0; - int ret = -1; - - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - int32_t wind_flags = flags & (~O_TRUNC); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); + GF_OPTION_INIT ("background-self-heal-count", + priv->background_self_heal_count, uint32, out); - priv = this->private; + GF_OPTION_INIT ("data-self-heal", priv->data_self_heal, str, out); - if (afr_is_split_brain (this, loc->inode)) { - /* self-heal failed */ - op_errno = EIO; - goto out; - } - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - frame->local = local; - call_count = local->call_count; - - local->cont.open.flags = flags; - local->fd = fd_ref (fd); - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->open, - loc, wind_flags, fd); - - if (!--call_count) - break; - } - } - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno, fd); - } - - return 0; -} - -/* }}} */ - -/* {{{ flush */ - -int -afr_flush_unwind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno); - } - - return 0; -} + GF_OPTION_INIT ("data-self-heal-algorithm", + priv->data_self_heal_algorithm, str, out); + GF_OPTION_INIT ("data-self-heal-window-size", + priv->data_self_heal_window_size, uint32, out); -int -afr_flush_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int child_index = (long) cookie; - int need_unwind = 0; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - if (need_unwind) - afr_flush_unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; -} - - -int -afr_flush_wind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int i = 0; - int call_count = -1; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_flush_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->flush, - local->fd); - - if (!--call_count) - break; - } - } - - return 0; -} + GF_OPTION_INIT ("metadata-self-heal", priv->metadata_self_heal, bool, + out); + GF_OPTION_INIT ("entry-self-heal", priv->entry_self_heal, bool, out); -int -afr_flush_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; + GF_OPTION_INIT ("data-change-log", priv->data_change_log, bool, out); - local = frame->local; + GF_OPTION_INIT ("metadata-change-log", priv->metadata_change_log, bool, + out); - local->transaction.unwind (frame, this); + GF_OPTION_INIT ("entry-change-log", priv->entry_change_log, bool, out); - AFR_STACK_DESTROY (frame); + GF_OPTION_INIT ("optimistic-change-log", priv->optimistic_change_log, + bool, out); - return 0; -} + GF_OPTION_INIT ("inodelk-trace", priv->inodelk_trace, bool, out); + GF_OPTION_INIT ("entrylk-trace", priv->entrylk_trace, bool, out); -int -afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; + GF_OPTION_INIT ("pre-op-compat", priv->pre_op_compat, bool, out); - call_frame_t * transaction_frame = NULL; + GF_OPTION_INIT ("eager-lock", priv->eager_lock, bool, out); + GF_OPTION_INIT ("quorum-type", qtype, str, out); + GF_OPTION_INIT ("quorum-count", priv->quorum_count, uint32, out); + GF_OPTION_INIT (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, size_uint64, + out); + fix_quorum_options(this,priv,qtype); - int ret = -1; + GF_OPTION_INIT ("post-op-delay-secs", priv->post_op_delay_secs, uint32, out); + GF_OPTION_INIT ("ensure-durability", priv->ensure_durability, bool, + out); - int op_ret = -1; - int op_errno = 0; + GF_OPTION_INIT ("self-heal-daemon", priv->shd.enabled, bool, out); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + GF_OPTION_INIT ("iam-self-heal-daemon", priv->shd.iamshd, bool, out); - priv = this->private; + priv->wait_count = 1; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; + priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, + gf_afr_mt_char); + if (!priv->child_up) { + ret = -ENOMEM; + goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - transaction_frame->local = local; - - local->op = GF_FOP_FLUSH; - - local->transaction.fop = afr_flush_wind; - local->transaction.done = afr_flush_done; - local->transaction.unwind = afr_flush_unwind; - - local->fd = fd_ref (fd); - - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; - - afr_transaction (transaction_frame, this, AFR_FLUSH_TRANSACTION); - - op_ret = 0; -out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); - } - - return 0; -} - -/* }}} */ - - -int -afr_release (xlator_t *this, fd_t *fd) -{ - uint64_t ctx; - afr_fd_ctx_t * fd_ctx; - - int ret = 0; - - ret = fd_ctx_get (fd, this, &ctx); - - if (ret < 0) + for (i = 0; i < child_count; i++) + priv->child_up[i] = -1; /* start with unknown state. + this initialization needed + for afr_notify() to work + reliably + */ + + priv->children = GF_CALLOC (sizeof (xlator_t *), child_count, + gf_afr_mt_xlator_t); + if (!priv->children) { + ret = -ENOMEM; goto out; - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - if (fd_ctx) { - if (fd_ctx->child_failed) - FREE (fd_ctx->child_failed); - - FREE (fd_ctx); } - -out: - return 0; -} - - -/* {{{ fsync */ - -int -afr_fsync_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - - int call_count = -1; - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); - - return 0; -} - - -int -afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int ret = -1; - - int i = 0; - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - call_count = local->call_count; - frame->local = local; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_fsync_cbk, - priv->children[i], - priv->children[i]->fops->fsync, - fd, datasync); - if (!--call_count) - break; - } - } - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno); - } - return 0; -} - -/* }}} */ - -/* {{{ fsync */ - -int32_t -afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - - int call_count = -1; - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); - - return 0; -} - - -int32_t -afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int ret = -1; - - int i = 0; - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - call_count = local->call_count; - frame->local = local; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_fsync_cbk, - priv->children[i], - priv->children[i]->fops->fsyncdir, - fd, datasync); - if (!--call_count) - break; - } - } - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno); - } - return 0; -} - -/* }}} */ - -/* {{{ xattrop */ - -int32_t -afr_xattrop_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) -{ - afr_local_t *local = NULL; - - int call_count = -1; - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, xattr); - - return 0; -} - - -int32_t -afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t optype, dict_t *xattr) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int ret = -1; - - int i = 0; - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - call_count = local->call_count; - frame->local = local; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_xattrop_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - loc, optype, xattr); - if (!--call_count) - break; - } - } - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno); - } - return 0; -} - -/* }}} */ - -/* {{{ fxattrop */ - -int32_t -afr_fxattrop_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) -{ - afr_local_t *local = NULL; - - int call_count = -1; - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, xattr); - return 0; -} - - -int32_t -afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, - gf_xattrop_flags_t optype, dict_t *xattr) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int ret = -1; - - int i = 0; - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - call_count = local->call_count; - frame->local = local; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_fxattrop_cbk, - priv->children[i], - priv->children[i]->fops->fxattrop, - fd, optype, xattr); - if (!--call_count) - break; - } - } - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno); - } - return 0; -} - -/* }}} */ - - -int32_t -afr_inodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) - -{ - afr_local_t *local = NULL; - - int call_count = -1; - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); - - return 0; -} - - -int32_t -afr_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct flock *flock) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int ret = -1; - - int i = 0; - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - call_count = local->call_count; - frame->local = local; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_inodelk_cbk, - priv->children[i], - priv->children[i]->fops->inodelk, - volume, loc, cmd, flock); - - if (!--call_count) - break; - } - } - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno); - } - return 0; -} - - -int32_t -afr_finodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) - -{ - afr_local_t *local = NULL; - - int call_count = -1; - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); - - return 0; -} - - -int32_t -afr_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct flock *flock) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int ret = -1; - - int i = 0; - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - call_count = local->call_count; - frame->local = local; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_finodelk_cbk, - priv->children[i], - priv->children[i]->fops->finodelk, - volume, fd, cmd, flock); - - if (!--call_count) - break; - } - } - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno); - } - return 0; -} - - -int32_t -afr_entrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) - -{ - afr_local_t *local = NULL; - - int call_count = -1; - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); - - return 0; -} - - -int32_t -afr_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, - const char *basename, entrylk_cmd cmd, entrylk_type type) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int ret = -1; - - int i = 0; - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - call_count = local->call_count; - frame->local = local; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_entrylk_cbk, - priv->children[i], - priv->children[i]->fops->entrylk, - volume, loc, basename, cmd, type); - - if (!--call_count) - break; - } - } - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno); - } - return 0; -} - - - -int32_t -afr_fentrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) - -{ - afr_local_t *local = NULL; - - int call_count = -1; - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); - - return 0; -} - - -int32_t -afr_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, - const char *basename, entrylk_cmd cmd, entrylk_type type) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int ret = -1; - - int i = 0; - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - call_count = local->call_count; - frame->local = local; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_fentrylk_cbk, - priv->children[i], - priv->children[i]->fops->fentrylk, - volume, fd, basename, cmd, type); - - if (!--call_count) - break; - } - } - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno); - } - return 0; -} - - -int32_t -afr_checksum_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - uint8_t *file_checksum, uint8_t *dir_checksum) - -{ - afr_local_t *local = NULL; - - int call_count = -1; - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == 0 && (local->op_ret != 0)) { - local->op_ret = 0; - - local->cont.checksum.file_checksum = MALLOC (ZR_FILENAME_MAX); - memcpy (local->cont.checksum.file_checksum, file_checksum, - ZR_FILENAME_MAX); - - local->cont.checksum.dir_checksum = MALLOC (ZR_FILENAME_MAX); - memcpy (local->cont.checksum.dir_checksum, dir_checksum, - ZR_FILENAME_MAX); - - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->cont.checksum.file_checksum, - local->cont.checksum.dir_checksum); - - return 0; -} - - -int32_t -afr_checksum (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t flag) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int ret = -1; - - int i = 0; - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - call_count = local->call_count; - frame->local = local; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_checksum_cbk, - priv->children[i], - priv->children[i]->fops->checksum, - loc, flag); - - if (!--call_count) - break; - } - } - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno); - } - return 0; -} - - -int32_t -afr_statfs_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct statvfs *statvfs) -{ - afr_local_t *local = NULL; - - int call_count = 0; - - LOCK (&frame->lock); - { - local = frame->local; - - if (op_ret == 0) { - local->op_ret = op_ret; - - if (local->cont.statfs.buf_set) { - if (statvfs->f_bavail < local->cont.statfs.buf.f_bavail) - local->cont.statfs.buf = *statvfs; - } else { - local->cont.statfs.buf = *statvfs; - local->cont.statfs.buf_set = 1; - } - } - - if (op_ret == -1) - local->op_errno = op_errno; - - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->cont.statfs.buf); - - return 0; -} - - -int32_t -afr_statfs (call_frame_t *frame, xlator_t *this, - loc_t *loc) -{ - afr_private_t * priv = NULL; - int child_count = 0; - afr_local_t * local = NULL; - int i = 0; - - int ret = -1; - int call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); - - priv = this->private; - child_count = priv->child_count; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - frame->local = local; - call_count = local->call_count; - - for (i = 0; i < child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_statfs_cbk, - priv->children[i], - priv->children[i]->fops->statfs, - loc); - if (!--call_count) - break; - } - } - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); - } - return 0; -} - - -int32_t -afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct flock *lock) -{ - afr_local_t * local = NULL; - - int call_count = -1; - - local = frame->local; - call_count = afr_frame_return (frame); - - if (call_count == 0) - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, - lock); - - return 0; -} - - -int32_t -afr_lk_unlock (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int i; - int call_count = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_locked_nodes_count (local->cont.lk.locked_nodes, - priv->child_count); - - if (call_count == 0) { - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->cont.lk.flock); - return 0; - } - - local->call_count = call_count; - - local->cont.lk.flock.l_type = F_UNLCK; - - for (i = 0; i < priv->child_count; i++) { - if (local->cont.lk.locked_nodes[i]) { - STACK_WIND (frame, afr_lk_unlock_cbk, - priv->children[i], - priv->children[i]->fops->lk, - local->fd, F_SETLK, - &local->cont.lk.flock); - - if (!--call_count) - break; - } - } - - return 0; -} - - -int32_t -afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct flock *lock) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int child_index = -1; - - local = frame->local; - priv = this->private; - - child_index = (long) cookie; - - call_count = --local->call_count; - - if (!child_went_down (op_ret, op_errno) && (op_ret == -1)) { - local->op_ret = -1; - local->op_errno = op_errno; - - afr_lk_unlock (frame, this); - return 0; - } - - if (op_ret == 0) { - local->op_ret = 0; - local->op_errno = 0; - local->cont.lk.flock = *lock; - local->cont.lk.locked_nodes[child_index] = 1; - } - - child_index++; - - if (child_index < priv->child_count) { - STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->lk, - local->fd, local->cont.lk.cmd, - &local->cont.lk.flock); - } else if (local->op_ret == -1) { - /* all nodes have gone down */ - - AFR_STACK_UNWIND (frame, -1, ENOTCONN, &local->cont.lk.flock); - } else { - /* locking has succeeded on all nodes that are up */ - - AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->cont.lk.flock); - } - - return 0; -} - - -int -afr_lk (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t cmd, - struct flock *flock) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int i = 0; - - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - AFR_LOCAL_INIT (local, priv); - - frame->local = local; - - local->cont.lk.locked_nodes = CALLOC (priv->child_count, - sizeof (*local->cont.lk.locked_nodes)); - - if (!local->cont.lk.locked_nodes) { - gf_log (this->name, GF_LOG_ERROR, "Out of memory"); - op_errno = ENOMEM; - goto out; - } - - local->fd = fd_ref (fd); - local->cont.lk.cmd = cmd; - local->cont.lk.flock = *flock; - - STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0, - priv->children[i], - priv->children[i]->fops->lk, - fd, cmd, flock); - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); - } - return 0; -} - - -/** - * find_child_index - find the child's index in the array of subvolumes - * @this: AFR - * @child: child - */ - -static int -find_child_index (xlator_t *this, xlator_t *child) -{ - afr_private_t *priv = NULL; - - int i = -1; - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if ((xlator_t *) child == priv->children[i]) - break; - } - - return i; -} - - -int32_t -notify (xlator_t *this, int32_t event, - void *data, ...) -{ - afr_private_t * priv = NULL; - unsigned char * child_up = NULL; - - int i = -1; - int up_children = 0; - - priv = this->private; - - if (!priv) - return 0; - - child_up = priv->child_up; - - switch (event) { - case GF_EVENT_CHILD_UP: - i = find_child_index (this, data); - - child_up[i] = 1; - - /* - if all the children were down, and one child came up, - send notify to parent - */ - - for (i = 0; i < priv->child_count; i++) - if (child_up[i]) - up_children++; - - if (up_children == 1) { - gf_log (this->name, GF_LOG_NORMAL, - "Subvolume '%s' came back up; " - "going online.", ((xlator_t *)data)->name); - - default_notify (this, event, data); - } - - break; - - case GF_EVENT_CHILD_DOWN: - i = find_child_index (this, data); - - child_up[i] = 0; - - /* - if all children are down, and this was the last to go down, - send notify to parent - */ - - for (i = 0; i < priv->child_count; i++) - if (child_up[i]) - up_children++; + priv->pending_key = GF_CALLOC (sizeof (*priv->pending_key), + child_count, + gf_afr_mt_char); + if (!priv->pending_key) { + ret = -ENOMEM; + goto out; + } - if (up_children == 0) { - gf_log (this->name, GF_LOG_ERROR, - "All subvolumes are down. Going offline " - "until atleast one of them comes back up."); + trav = this->children; + i = 0; + while (i < child_count) { + priv->children[i] = trav->xlator; - default_notify (this, event, data); + ret = gf_asprintf (&priv->pending_key[i], "%s.%s", + AFR_XATTR_PREFIX, + trav->xlator->name); + if (-1 == ret) { + ret = -ENOMEM; + goto out; } - break; - - default: - default_notify (this, event, data); - } - - return 0; -} - - -static const char *favorite_child_warning_str = "You have specified subvolume '%s' " - "as the 'favorite child'. This means that if a discrepancy in the content " - "or attributes (ownership, permission, etc.) of a file is detected among " - "the subvolumes, the file on '%s' will be considered the definitive " - "version and its contents will OVERWRITE the contents of the file on other " - "subvolumes. All versions of the file except that on '%s' " - "WILL BE LOST."; - -static const char *no_lock_servers_warning_str = "You have set lock-server-count = 0. " - "This means correctness is NO LONGER GUARANTEED in all cases. If two or more " - "applications write to the same region of a file, there is a possibility that " - "its copies will be INCONSISTENT. Set it to a value greater than 0 unless you " - "are ABSOLUTELY SURE of what you are doing and WILL NOT HOLD GlusterFS " - "RESPONSIBLE for inconsistent data. If you are in doubt, set it to a value " - "greater than 0."; - -int32_t -init (xlator_t *this) -{ - afr_private_t * priv = NULL; - int child_count = 0; - xlator_list_t * trav = NULL; - int i = 0; - int ret = -1; - int op_errno = 0; - - char * read_subvol = NULL; - char * fav_child = NULL; - char * self_heal = NULL; - char * change_log = NULL; - - int32_t lock_server_count = 1; - - int fav_ret = -1; - int read_ret = -1; - int dict_ret = -1; - - if (!this->children) { - gf_log (this->name, GF_LOG_ERROR, - "replicate translator needs more than one " - "subvolume defined."); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "Volume is dangling."); - } - - ALLOC_OR_GOTO (this->private, afr_private_t, out); - - priv = this->private; - - read_ret = dict_get_str (this->options, "read-subvolume", &read_subvol); - priv->read_child = -1; - - fav_ret = dict_get_str (this->options, "favorite-child", &fav_child); - priv->favorite_child = -1; - - /* Default values */ - - priv->data_self_heal = 1; - priv->metadata_self_heal = 1; - priv->entry_self_heal = 1; - - dict_ret = dict_get_str (this->options, "data-self-heal", &self_heal); - if (dict_ret == 0) { - ret = gf_string2boolean (self_heal, &priv->data_self_heal); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option data-self-heal %s'. " - "Defaulting to data-self-heal as 'on'", - self_heal); - priv->data_self_heal = 1; - } - } - - dict_ret = dict_get_str (this->options, "metadata-self-heal", - &self_heal); - if (dict_ret == 0) { - ret = gf_string2boolean (self_heal, &priv->metadata_self_heal); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option metadata-self-heal %s'. " - "Defaulting to metadata-self-heal as 'on'.", - self_heal); - priv->metadata_self_heal = 1; - } - } - - dict_ret = dict_get_str (this->options, "entry-self-heal", &self_heal); - if (dict_ret == 0) { - ret = gf_string2boolean (self_heal, &priv->entry_self_heal); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option entry-self-heal %s'. " - "Defaulting to entry-self-heal as 'on'.", - self_heal); - priv->entry_self_heal = 1; - } - } - - /* Change log options */ - - priv->data_change_log = 1; - priv->metadata_change_log = 0; - priv->entry_change_log = 1; - - dict_ret = dict_get_str (this->options, "data-change-log", - &change_log); - if (dict_ret == 0) { - ret = gf_string2boolean (change_log, &priv->data_change_log); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option data-change-log %s'. " - "Defaulting to data-change-log as 'on'.", - change_log); - priv->data_change_log = 1; - } - } - - dict_ret = dict_get_str (this->options, "metadata-change-log", - &change_log); - if (dict_ret == 0) { - ret = gf_string2boolean (change_log, - &priv->metadata_change_log); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option metadata-change-log %s'. " - "Defaulting to metadata-change-log as 'off'.", - change_log); - priv->metadata_change_log = 0; - } - } - - dict_ret = dict_get_str (this->options, "entry-change-log", - &change_log); - if (dict_ret == 0) { - ret = gf_string2boolean (change_log, &priv->entry_change_log); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option entry-change-log %s'. " - "Defaulting to entry-change-log as 'on'.", - change_log); - priv->entry_change_log = 1; - } - } - - /* Locking options */ - - priv->data_lock_server_count = 1; - priv->metadata_lock_server_count = 0; - priv->entry_lock_server_count = 1; - - dict_ret = dict_get_int32 (this->options, "data-lock-server-count", - &lock_server_count); - if (dict_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Setting data lock server count to %d.", - lock_server_count); - - if (lock_server_count == 0) - gf_log (this->name, GF_LOG_WARNING, - no_lock_servers_warning_str); - - priv->data_lock_server_count = lock_server_count; - } - - - dict_ret = dict_get_int32 (this->options, - "metadata-lock-server-count", - &lock_server_count); - if (dict_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Setting metadata lock server count to %d.", - lock_server_count); - priv->metadata_lock_server_count = lock_server_count; - } - - - dict_ret = dict_get_int32 (this->options, "entry-lock-server-count", - &lock_server_count); - if (dict_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Setting entry lock server count to %d.", - lock_server_count); - - priv->entry_lock_server_count = lock_server_count; - } - - trav = this->children; - while (trav) { - if (!read_ret && !strcmp (read_subvol, trav->xlator->name)) { - gf_log (this->name, GF_LOG_DEBUG, - "Subvolume '%s' specified as read child.", - trav->xlator->name); - - priv->read_child = child_count; - } - - if (fav_ret == 0 && !strcmp (fav_child, trav->xlator->name)) { - gf_log (this->name, GF_LOG_WARNING, - favorite_child_warning_str, trav->xlator->name, - trav->xlator->name, trav->xlator->name); - priv->favorite_child = child_count; - } - - child_count++; - trav = trav->next; - } - - priv->wait_count = 1; - - priv->child_count = child_count; + trav = trav->next; + i++; + } - LOCK_INIT (&priv->lock); - LOCK_INIT (&priv->read_child_lock); + ret = gf_asprintf (&priv->sh_domain, AFR_SH_DATA_DOMAIN_FMT, + this->name); + if (-1 == ret) { + ret = -ENOMEM; + goto out; + } - priv->child_up = CALLOC (sizeof (unsigned char), child_count); - if (!priv->child_up) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - op_errno = ENOMEM; - goto out; - } + priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event), + gf_afr_mt_int32_t); + if (!priv->last_event) { + ret = -ENOMEM; + goto out; + } - priv->children = CALLOC (sizeof (xlator_t *), child_count); - if (!priv->children) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - op_errno = ENOMEM; + ret = afr_selfheal_daemon_init (this); + if (ret) { + ret = -ENOMEM; goto out; } - priv->pending_key = CALLOC (sizeof (*priv->pending_key), child_count); - if (!priv->pending_key) { + /* keep more local here as we may need them for self-heal etc */ + this->local_pool = mem_pool_new (afr_local_t, 512); + if (!this->local_pool) { + ret = -1; gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - op_errno = ENOMEM; + "failed to create local_t's memory pool"); goto out; } - trav = this->children; - i = 0; - while (i < child_count) { - priv->children[i] = trav->xlator; - - asprintf (&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX, - trav->xlator->name); - - trav = trav->next; - i++; - } + priv->root_inode = NULL; - ret = 0; + ret = 0; out: - return ret; + return ret; } int fini (xlator_t *this) { - return 0; + afr_private_t *priv = NULL; + + priv = this->private; + this->private = NULL; + afr_priv_destroy (priv); + //if (this->itable);//I dont see any destroy func + + return 0; } struct xlator_fops fops = { - .lookup = afr_lookup, - .open = afr_open, - .lk = afr_lk, - .flush = afr_flush, - .statfs = afr_statfs, - .fsync = afr_fsync, - .fsyncdir = afr_fsyncdir, - .xattrop = afr_xattrop, - .fxattrop = afr_fxattrop, - .inodelk = afr_inodelk, - .finodelk = afr_finodelk, - .entrylk = afr_entrylk, - .fentrylk = afr_fentrylk, - .checksum = afr_checksum, - - /* inode read */ - .access = afr_access, - .stat = afr_stat, - .fstat = afr_fstat, - .readlink = afr_readlink, - .getxattr = afr_getxattr, - .readv = afr_readv, - - /* inode write */ - .chmod = afr_chmod, - .chown = afr_chown, - .fchmod = afr_fchmod, - .fchown = afr_fchown, - .writev = afr_writev, - .truncate = afr_truncate, - .ftruncate = afr_ftruncate, - .utimens = afr_utimens, - .setxattr = afr_setxattr, - .removexattr = afr_removexattr, - - /* dir read */ - .opendir = afr_opendir, - .readdir = afr_readdir, - .getdents = afr_getdents, - - /* dir write */ - .create = afr_create, - .mknod = afr_mknod, - .mkdir = afr_mkdir, - .unlink = afr_unlink, - .rmdir = afr_rmdir, - .link = afr_link, - .symlink = afr_symlink, - .rename = afr_rename, - .setdents = afr_setdents, + .lookup = afr_lookup, + .open = afr_open, + .lk = afr_lk, + .flush = afr_flush, + .statfs = afr_statfs, + .fsync = afr_fsync, + .fsyncdir = afr_fsyncdir, + .xattrop = afr_xattrop, + .fxattrop = afr_fxattrop, + .inodelk = afr_inodelk, + .finodelk = afr_finodelk, + .entrylk = afr_entrylk, + .fentrylk = afr_fentrylk, + .fallocate = afr_fallocate, + .discard = afr_discard, + .zerofill = afr_zerofill, + + /* inode read */ + .access = afr_access, + .stat = afr_stat, + .fstat = afr_fstat, + .readlink = afr_readlink, + .getxattr = afr_getxattr, + .fgetxattr = afr_fgetxattr, + .readv = afr_readv, + + /* inode write */ + .writev = afr_writev, + .truncate = afr_truncate, + .ftruncate = afr_ftruncate, + .setxattr = afr_setxattr, + .fsetxattr = afr_fsetxattr, + .setattr = afr_setattr, + .fsetattr = afr_fsetattr, + .removexattr = afr_removexattr, + .fremovexattr = afr_fremovexattr, + + /* dir read */ + .opendir = afr_opendir, + .readdir = afr_readdir, + .readdirp = afr_readdirp, + + /* dir write */ + .create = afr_create, + .mknod = afr_mknod, + .mkdir = afr_mkdir, + .unlink = afr_unlink, + .rmdir = afr_rmdir, + .link = afr_link, + .symlink = afr_symlink, + .rename = afr_rename, }; -struct xlator_mops mops = { +struct xlator_dumpops dumpops = { + .priv = afr_priv_dump, }; struct xlator_cbks cbks = { .release = afr_release, + .releasedir = afr_releasedir, + .forget = afr_forget, }; struct volume_options options[] = { - { .key = {"read-subvolume" }, - .type = GF_OPTION_TYPE_XLATOR - }, - { .key = {"favorite-child"}, - .type = GF_OPTION_TYPE_XLATOR - }, - { .key = {"data-self-heal"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {"metadata-self-heal"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {"entry-self-heal"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {"data-change-log"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {"metadata-change-log"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {"entry-change-log"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {"data-lock-server-count"}, - .type = GF_OPTION_TYPE_INT, - .min = 0 + { .key = {"read-subvolume" }, + .type = GF_OPTION_TYPE_XLATOR, + .description = "inode-read fops happen only on one of the bricks in " + "replicate. Afr will prefer the one specified using " + "this option if it is not stale. Option value must be " + "one of the xlator names of the children. " + "Ex: <volname>-client-0 till " + "<volname>-client-<number-of-bricks - 1>" + }, + { .key = {"read-subvolume-index" }, + .type = GF_OPTION_TYPE_INT, + .default_value = "-1", + .description = "inode-read fops happen only on one of the bricks in " + "replicate. AFR will prefer the one specified using " + "this option if it is not stale. allowed options" + " include -1 till replica-count - 1" + }, + { .key = {"read-hash-mode" }, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = 2, + .default_value = "1", + .description = "inode-read fops happen only on one of the bricks in " + "replicate. AFR will prefer the one computed using " + "the method specified using this option" + "0 = first up server, " + "1 = hash by GFID of file (all clients use " + "same subvolume), " + "2 = hash by GFID of file and client PID", + }, + { .key = {"choose-local" }, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "true", + .description = "Choose a local subvolume (i.e. Brick) to read from" + " if read-subvolume is not explicitly set.", + }, + { .key = {"favorite-child"}, + .type = GF_OPTION_TYPE_XLATOR, + .description = "If a split-brain happens choose subvol/brick set by " + "this option as source." + }, + { .key = {"background-self-heal-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .default_value = "16", + .validate = GF_OPT_VALIDATE_MIN, + .description = "This specifies the number of self-heals that can be " + " performed in background without blocking the fop" + }, + { .key = {"data-self-heal"}, + .type = GF_OPTION_TYPE_STR, + .value = {"1", "on", "yes", "true", "enable", + "0", "off", "no", "false", "disable", + "open"}, + .default_value = "on", + .description = "Using this option we can enable/disable data " + "self-heal on the file. \"open\" means data " + "self-heal action will only be triggered by file " + "open operations." + }, + { .key = {"data-self-heal-algorithm"}, + .type = GF_OPTION_TYPE_STR, + .description = "Select between \"full\", \"diff\". The " + "\"full\" algorithm copies the entire file from " + "source to sink. The \"diff\" algorithm copies to " + "sink only those blocks whose checksums don't match " + "with those of source. If no option is configured " + "the option is chosen dynamically as follows: " + "If the file does not exist on one of the sinks " + "or empty file exists or if the source file size is " + "about the same as page size the entire file will " + "be read and written i.e \"full\" algo, " + "otherwise \"diff\" algo is chosen.", + .value = { "diff", "full"} + }, + { .key = {"data-self-heal-window-size"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 1024, + .default_value = "1", + .description = "Maximum number blocks per file for which self-heal " + "process would be applied simultaneously." + }, + { .key = {"metadata-self-heal"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Using this option we can enable/disable metadata " + "i.e. Permissions, ownerships, xattrs self-heal on " + "the file/directory." + }, + { .key = {"entry-self-heal"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Using this option we can enable/disable entry " + "self-heal on the directory." + }, + { .key = {"data-change-log"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Data fops like write/truncate will not perform " + "pre/post fop changelog operations in afr transaction " + "if this option is disabled" + }, + { .key = {"metadata-change-log"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Metadata fops like setattr/setxattr will not perform " + "pre/post fop changelog operations in afr transaction " + "if this option is disabled" + }, + { .key = {"entry-change-log"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Entry fops like create/unlink will not perform " + "pre/post fop changelog operations in afr transaction " + "if this option is disabled" + }, + { .key = {"optimistic-change-log"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Entry/Metadata fops will not perform " + "pre fop changelog operations in afr transaction " + "if this option is enabled." + }, + { .key = {"inodelk-trace"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Enabling this option logs inode lock/unlocks" + }, + { .key = {"entrylk-trace"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Enabling this option logs entry lock/unlocks" + }, + { .key = {"pre-op-compat"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Use separate pre-op xattrop() FOP rather than " + "overloading xdata of the OP" }, - { .key = {"metadata-lock-server-count"}, - .type = GF_OPTION_TYPE_INT, - .min = 0 + { .key = {"eager-lock"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Lock phase of a transaction has two sub-phases. " + "First is an attempt to acquire locks in parallel by " + "broadcasting non-blocking lock requests. If lock " + "acquisition fails on any server, then the held locks " + "are unlocked and revert to a blocking locked mode " + "sequentially on one server after another. If this " + "option is enabled the initial broadcasting lock " + "request attempt to acquire lock on the entire file. " + "If this fails, we revert back to the sequential " + "\"regional\" blocking lock as before. In the case " + "where such an \"eager\" lock is granted in the " + "non-blocking phase, it gives rise to an opportunity " + "for optimization. i.e, if the next write transaction " + "on the same FD arrives before the unlock phase of " + "the first transaction, it \"takes over\" the full " + "file lock. Similarly if yet another data transaction " + "arrives before the unlock phase of the \"optimized\" " + "transaction, that in turn \"takes over\" the lock as " + "well. The actual unlock now happens at the end of " + "the last \"optimized\" transaction." + + }, + { .key = {"self-heal-daemon"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "This option applies to only self-heal-daemon. " + "Index directory crawl and automatic healing of files " + "will not be performed if this option is turned off." + }, + { .key = {"iam-self-heal-daemon"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option differentiates if the replicate " + "translator is running as part of self-heal-daemon " + "or not." + }, + { .key = {"quorum-type"}, + .type = GF_OPTION_TYPE_STR, + .value = { "none", "auto", "fixed"}, + .default_value = "none", + .description = "If value is \"fixed\" only allow writes if " + "quorum-count bricks are present. If value is " + "\"auto\" only allow writes if more than half of " + "bricks, or exactly half including the first, are " + "present.", + }, + { .key = {"quorum-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = INT_MAX, + .default_value = 0, + .description = "If quorum-type is \"fixed\" only allow writes if " + "this many bricks or present. Other quorum types " + "will OVERWRITE this value.", + }, + { .key = {"node-uuid"}, + .type = GF_OPTION_TYPE_STR, + .description = "Local glusterd uuid string, used in starting " + "self-heal-daemon so that it can crawl only on " + "local index directories.", + }, + { .key = {"post-op-delay-secs"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = INT_MAX, + .default_value = "1", + .description = "Time interval induced artificially before " + "post-operation phase of the transaction to " + "enhance overlap of adjacent write operations.", + }, + { .key = {AFR_SH_READDIR_SIZE_KEY}, + .type = GF_OPTION_TYPE_SIZET, + .description = "readdirp size for performing entry self-heal", + .min = 1024, + .max = 131072, + .default_value = "1KB", + }, + { .key = {"ensure-durability"}, + .type = GF_OPTION_TYPE_BOOL, + .description = "Afr performs fsyncs for transactions if this " + "option is on to make sure the changelogs/data is " + "written to the disk", + .default_value = "on", + }, + { .key = {"afr-dirty-xattr"}, + .type = GF_OPTION_TYPE_STR, + .default_value = AFR_DIRTY_DEFAULT, }, - { .key = {"entry-lock-server-count"}, - .type = GF_OPTION_TYPE_INT, - .min = 0 + { .key = {"metadata-splitbrain-forced-heal"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", }, - { .key = {NULL} }, + { .key = {NULL} }, }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 2871bfc47..36042f7b2 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -26,85 +17,147 @@ #include "config.h" #endif -#include "scheduler.h" #include "call-stub.h" #include "compat-errno.h" +#include "afr-mem-types.h" -#define AFR_XATTR_PREFIX "trusted.afr" +#include "libxlator.h" +#include "timer.h" +#include "syncop.h" -typedef struct _afr_private { - gf_lock_t lock; /* to guard access to child_count, etc */ - unsigned int child_count; /* total number of children */ +#include "afr-self-heald.h" - unsigned int read_child_rr; /* round-robin index of the read_child */ - gf_lock_t read_child_lock; /* lock to protect above */ - - xlator_t **children; +#define AFR_XATTR_PREFIX "trusted.afr" +#define AFR_PATHINFO_HEADER "REPLICATE:" +#define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size" +#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal" +#define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty" +#define AFR_DIRTY (((afr_private_t *) (THIS->private))->afr_dirty) - unsigned char *child_up; +#define AFR_LOCKEE_COUNT_MAX 3 +#define AFR_DOM_COUNT_MAX 3 +#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/ - char **pending_key; +typedef int (*afr_lock_cbk_t) (call_frame_t *frame, xlator_t *this); - gf_boolean_t data_self_heal; /* on/off */ - gf_boolean_t metadata_self_heal; /* on/off */ - gf_boolean_t entry_self_heal; /* on/off */ +typedef int (*afr_read_txn_wind_t) (call_frame_t *frame, xlator_t *this, int subvol); - gf_boolean_t data_change_log; /* on/off */ - gf_boolean_t metadata_change_log; /* on/off */ - gf_boolean_t entry_change_log; /* on/off */ +typedef int (*afr_inode_refresh_cbk_t) (call_frame_t *frame, xlator_t *this, int err); - int read_child; /* read-subvolume */ - unsigned int favorite_child; /* subvolume to be preferred in resolving - split-brain cases */ +typedef int (*afr_changelog_resume_t) (call_frame_t *frame, xlator_t *this); - unsigned int data_lock_server_count; - unsigned int metadata_lock_server_count; - unsigned int entry_lock_server_count; +#define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr;}) +#define AFR_COUNT(array,max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res;}) +#define AFR_INTERSECT(dst,src1,src2,max) ({int __i; for (__i = 0; __i < max; __i++) dst[__i] = src1[__i] && src2[__i];}) - unsigned int wait_count; /* # of servers to wait for success */ -} afr_private_t; +typedef struct _afr_private { + gf_lock_t lock; /* to guard access to child_count, etc */ + unsigned int child_count; /* total number of children */ -typedef struct { - /* array of stat's, one for each child */ - struct stat *buf; + xlator_t **children; - /* array of xattr's, one for each child */ - dict_t **xattr; + inode_t *root_inode; - /* array of errno's, one for each child */ - int *child_errno; + unsigned char *child_up; - int32_t **pending_matrix; - int32_t **delta_matrix; + char **pending_key; - int *sources; - int source; - int active_source; - int active_sinks; - int *success; + char *data_self_heal; /* on/off/open */ + char * data_self_heal_algorithm; /* name of algorithm */ + unsigned int data_self_heal_window_size; /* max number of pipelined + read/writes */ + + unsigned int background_self_heal_count; + unsigned int background_self_heals_started; + gf_boolean_t metadata_self_heal; /* on/off */ + gf_boolean_t entry_self_heal; /* on/off */ + + gf_boolean_t data_change_log; /* on/off */ + gf_boolean_t metadata_change_log; /* on/off */ + gf_boolean_t entry_change_log; /* on/off */ + + gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */ + int read_child; /* read-subvolume */ + unsigned int hash_mode; /* for when read_child is not set */ + int favorite_child; /* subvolume to be preferred in resolving + split-brain cases */ + + gf_boolean_t inodelk_trace; + gf_boolean_t entrylk_trace; + + unsigned int wait_count; /* # of servers to wait for success */ + + uint64_t up_count; /* number of CHILD_UPs we have seen */ + uint64_t down_count; /* number of CHILD_DOWNs we have seen */ + + gf_boolean_t optimistic_change_log; + gf_boolean_t eager_lock; + gf_boolean_t pre_op_compat; /* on/off */ + uint32_t post_op_delay_secs; + unsigned int quorum_count; + + char vol_uuid[UUID_SIZE + 1]; + int32_t *last_event; + + /* @event_generation: Keeps count of number of events received which can + potentially impact consistency decisions. The events are CHILD_UP + and CHILD_DOWN, when we have to recalculate the freshness/staleness + of copies to detect if changes had happened while the other server + was down. CHILD_DOWN and CHILD_UP can also be received on network + disconnect/reconnects and not necessarily server going down/up. + Recalculating freshness/staleness on network events is equally + important as we might have had a network split brain. + */ + uint32_t event_generation; - fd_t *healing_fd; - int op_failed; + gf_boolean_t choose_local; + gf_boolean_t did_discovery; + uint64_t sh_readdir_size; + gf_boolean_t ensure_durability; + char *sh_domain; + char *afr_dirty; - int file_has_holes; - blksize_t block_size; - off_t file_size; - off_t offset; + afr_self_heald_t shd; - loc_t parent_loc; - int (*completion_cbk) (call_frame_t *frame, xlator_t *this); - call_frame_t *sh_frame; -} afr_self_heal_t; + /* pump dependencies */ + void *pump_private; + gf_boolean_t use_afr_in_pump; +} afr_private_t; typedef enum { - AFR_DATA_TRANSACTION, /* truncate, write, ... */ - AFR_METADATA_TRANSACTION, /* chmod, chown, ... */ - AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */ - AFR_ENTRY_RENAME_TRANSACTION, /* rename */ - AFR_FLUSH_TRANSACTION, /* flush */ + AFR_DATA_TRANSACTION, /* truncate, write, ... */ + AFR_METADATA_TRANSACTION, /* chmod, chown, ... */ + AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */ + AFR_ENTRY_RENAME_TRANSACTION, /* rename */ } afr_transaction_type; +typedef enum { + AFR_TRANSACTION_LK, + AFR_SELFHEAL_LK, +} transaction_lk_type_t; + +typedef enum { + AFR_LOCK_OP, + AFR_UNLOCK_OP, +} afr_lock_op_type_t; + +typedef enum { + AFR_DATA_SELF_HEAL_LK, + AFR_METADATA_SELF_HEAL_LK, + AFR_ENTRY_SELF_HEAL_LK, +}selfheal_lk_type_t; + +typedef enum { + AFR_INODELK_TRANSACTION, + AFR_INODELK_NB_TRANSACTION, + AFR_ENTRYLK_TRANSACTION, + AFR_ENTRYLK_NB_TRANSACTION, + AFR_INODELK_SELFHEAL, + AFR_INODELK_NB_SELFHEAL, + AFR_ENTRYLK_SELFHEAL, + AFR_ENTRYLK_NB_SELFHEAL, +} afr_lock_call_type_t; /* xattr format: trusted.afr.volume = [x y z] @@ -117,9 +170,8 @@ static inline int afr_index_for_transaction_type (afr_transaction_type type) { switch (type) { - + case AFR_DATA_TRANSACTION: - case AFR_FLUSH_TRANSACTION: return 0; case AFR_METADATA_TRANSACTION: @@ -133,352 +185,636 @@ afr_index_for_transaction_type (afr_transaction_type type) return -1; /* make gcc happy */ } +typedef struct { + loc_t loc; + char *basename; + unsigned char *locked_nodes; + int locked_count; -typedef struct _afr_local { - unsigned int call_count; - unsigned int success_count; - unsigned int enoent_count; +} afr_entry_lockee_t; - unsigned int need_metadata_self_heal; - unsigned int need_entry_self_heal; - unsigned int need_data_self_heal; - unsigned int govinda_gOvinda; +int +afr_entry_lockee_cmp (const void *l1, const void *l2); - unsigned int read_child_index; +typedef struct { + char *domain; /* Domain on which inodelk is taken */ + struct gf_flock flock; + unsigned char *locked_nodes; + int32_t lock_count; +} afr_inodelk_t; - pid_t saved_pid; +typedef struct { + loc_t *lk_loc; + + int lockee_count; + afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX]; + + afr_inodelk_t inodelk[AFR_DOM_COUNT_MAX]; + const char *lk_basename; + const char *lower_basename; + const char *higher_basename; + char lower_locked; + char higher_locked; + + unsigned char *locked_nodes; + unsigned char *lower_locked_nodes; + + selfheal_lk_type_t selfheal_lk_type; + transaction_lk_type_t transaction_lk_type; + + int32_t lock_count; + int32_t entrylk_lock_count; + + uint64_t lock_number; + int32_t lk_call_count; + int32_t lk_expected_count; + int32_t lk_attempted_count; + + int32_t lock_op_ret; + int32_t lock_op_errno; + afr_lock_cbk_t lock_cbk; + char *domain; /* Domain on which inode/entry lock/unlock in progress.*/ +} afr_internal_lock_t; + +struct afr_reply { + int valid; + int32_t op_ret; + int32_t op_errno; + dict_t *xdata; + struct iatt poststat; + struct iatt postparent; + struct iatt prestat; + struct iatt preparent; + struct iatt preparent2; + struct iatt postparent2; + uint8_t checksum[MD5_DIGEST_LENGTH]; +}; - int32_t op_ret; - int32_t op_errno; +typedef enum { + AFR_FD_NOT_OPENED, + AFR_FD_OPENED, + AFR_FD_OPENING +} afr_fd_open_status_t; - int32_t **pending; +typedef struct { + unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS]; + int inherited[AFR_NUM_CHANGE_LOGS]; + int on_disk[AFR_NUM_CHANGE_LOGS]; + afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */ - loc_t loc; - loc_t newloc; + unsigned int *lock_piggyback; + unsigned int *lock_acquired; - fd_t *fd; + int flags; - glusterfs_fop_t fop; + /* used for delayed-post-op optimization */ + pthread_mutex_t delay_lock; + gf_timer_t *delay_timer; + call_frame_t *delay_frame; - unsigned char *child_up; - int child_count; + /* set if any write on this fd was a non stable write + (i.e, without O_SYNC or O_DSYNC) + */ + gf_boolean_t witnessed_unstable_write; - int32_t *child_errno; - - dict_t *xattr_req; - int open_fd_count; - /* - This struct contains the arguments for the "continuation" - (scheme-like) of fops + /* @open_fd_count: + Number of open FDs queried from the server, as queried through + xdata in FOPs. Currently, used to decide if eager-locking must be + temporarily disabled. */ + uint32_t open_fd_count; - int op; - struct { - struct { - unsigned char buf_set; - struct statvfs buf; - } statfs; - struct { - inode_t *inode; - struct stat buf; - dict_t *xattr; - } lookup; + /* list of frames currently in progress */ + struct list_head eager_locked; +} afr_fd_ctx_t; - struct { - int32_t flags; - } open; - struct { - int32_t cmd; - struct flock flock; - unsigned char *locked_nodes; - } lk; +typedef struct _afr_local { + glusterfs_fop_t op; + unsigned int call_count; - struct { - uint8_t *file_checksum; - uint8_t *dir_checksum; - } checksum; + /* @event_generation: copy of priv->event_generation taken at the + time of starting the transaction. The copy is made so that we + have a stable value through the various phases of the transaction. + */ + unsigned int event_generation; - /* inode read */ + uint32_t open_fd_count; + gf_boolean_t update_open_fd_count; - struct { - int32_t mask; - int last_tried; /* index of the child we tried previously */ - } access; + gf_lkowner_t saved_lk_owner; - struct { - int last_tried; - ino_t ino; - } stat; + int32_t op_ret; + int32_t op_errno; - struct { - int last_tried; - ino_t ino; - } fstat; + int32_t **pending; - struct { - size_t size; - int last_tried; - } readlink; + int dirty[AFR_NUM_CHANGE_LOGS]; - struct { - const char *name; - int last_tried; - } getxattr; + loc_t loc; + loc_t newloc; - struct { - size_t size; - off_t offset; - int last_tried; - } readv; + fd_t *fd; + afr_fd_ctx_t *fd_ctx; - /* dir read */ + /* @child_up: copy of priv->child_up taken at the time of transaction + start. The copy is taken so that we have a stable child_up array + through the phases of the transaction as priv->child_up[i] can keep + changing through time. + */ + unsigned char *child_up; + + /* @read_attempted: + array of flags representing subvolumes where read operations of + the read transaction have already been attempted. The array is + first pre-filled with down subvolumes, and as reads are performed + on other subvolumes, those are set as well. This way if the read + operation fails we do not retry on that subvolume again. + */ + unsigned char *read_attempted; - struct { - int success_count; - int32_t op_ret; - int32_t op_errno; - } opendir; + /* @readfn: - struct { - int32_t op_ret; - int32_t op_errno; - size_t size; - off_t offset; + pointer to function which will perform the read operation on a given + subvolume. Used in read transactions. + */ - int last_tried; - } readdir; + afr_read_txn_wind_t readfn; - struct { - int32_t op_ret; - int32_t op_errno; + /* @refreshed: - size_t size; - off_t offset; - int32_t flag; + the inode was "refreshed" (i.e, pending xattrs from all subvols + freshly inspected and inode ctx updated accordingly) as part of + this transaction already. + */ + gf_boolean_t refreshed; - int last_tried; - } getdents; + /* @inode: - /* inode write */ + the inode on which the read txn is performed on. ref'ed and copied + from either fd->inode or loc.inode + */ - struct { - ino_t ino; - mode_t mode; - struct stat buf; - } chmod; + inode_t *inode; - struct { - ino_t ino; - mode_t mode; - struct stat buf; - } fchmod; + /* @parent[2]: - struct { - ino_t ino; - uid_t uid; - gid_t gid; - struct stat buf; - } chown; + parent inode[s] on which directory transactions are performed. + */ - struct { - ino_t ino; - uid_t uid; - gid_t gid; - struct stat buf; - } fchown; - - struct { - ino_t ino; - struct stat buf; + inode_t *parent; + inode_t *parent2; - int32_t op_ret; + /* @readable: - struct iovec *vector; - struct iobref *iobref; - int32_t count; - off_t offset; - } writev; + array of flags representing servers from which a read can be + performed. This is the output of afr_inode_refresh() + */ + unsigned char *readable; + + afr_inode_refresh_cbk_t refreshfn; + + /* @refreshinode: + + Inode currently getting refreshed. + */ + inode_t *refreshinode; + + /* + @pre_op_compat: + + compatibility mode of pre-op. send a separate pre-op and + op operations as part of transaction, rather than combining + */ + + gf_boolean_t pre_op_compat; + + dict_t *xattr_req; + + afr_internal_lock_t internal_lock; + + dict_t *dict; + + int optimistic_change_log; + gf_boolean_t delayed_post_op; + + /* Is the current writev() going to perform a stable write? + i.e, is fd->flags or @flags writev param have O_SYNC or + O_DSYNC? + */ + gf_boolean_t stable_write; + + /* This write appended to the file. Nnot necessarily O_APPEND, + just means the offset of write was at the end of file. + */ + gf_boolean_t append_write; + + /* + This struct contains the arguments for the "continuation" + (scheme-like) of fops + */ + + struct { + struct { + unsigned char buf_set; + struct statvfs buf; + } statfs; + + struct { + int32_t flags; + } open; + + struct { + int32_t cmd; + struct gf_flock user_flock; + struct gf_flock ret_flock; + unsigned char *locked_nodes; + } lk; + + /* inode read */ + + struct { + int32_t mask; + int last_index; /* index of the child we tried previously */ + } access; + + struct { + int last_index; + } stat; + + struct { + int last_index; + } fstat; + + struct { + size_t size; + int last_index; + } readlink; + + struct { + char *name; + int last_index; + long xattr_len; + } getxattr; + + struct { + size_t size; + off_t offset; + int last_index; + uint32_t flags; + } readv; + + /* dir read */ + + struct { + int success_count; + int32_t op_ret; + int32_t op_errno; + + uint32_t *checksum; + } opendir; + + struct { + int32_t op_ret; + int32_t op_errno; + size_t size; + off_t offset; + dict_t *dict; + gf_boolean_t failed; + int last_index; + } readdir; + /* inode write */ + + struct { + struct iatt prebuf; + struct iatt postbuf; + } inode_wfop; //common structure for all inode-write-fops + + struct { + int32_t op_ret; + + struct iovec *vector; + struct iobref *iobref; + int32_t count; + off_t offset; + uint32_t flags; + } writev; + + struct { + off_t offset; + } truncate; + + struct { + off_t offset; + } ftruncate; + + struct { + struct iatt in_buf; + int32_t valid; + } setattr; + + struct { + struct iatt in_buf; + int32_t valid; + } fsetattr; + + struct { + dict_t *dict; + int32_t flags; + } setxattr; + + struct { + dict_t *dict; + int32_t flags; + } fsetxattr; + + struct { + char *name; + } removexattr; + + struct { + dict_t *xattr; + } xattrop; + + struct { + dict_t *xattr; + } fxattrop; + + /* dir write */ + + struct { + inode_t *inode; + struct iatt buf; + struct iatt preparent; + struct iatt postparent; + struct iatt prenewparent; + struct iatt postnewparent; + } dir_fop; //common structure for all dir fops + + struct { + fd_t *fd; + dict_t *params; + int32_t flags; + mode_t mode; + } create; + + struct { + dev_t dev; + mode_t mode; + dict_t *params; + } mknod; + + struct { + int32_t mode; + dict_t *params; + } mkdir; + + struct { + int flags; + } rmdir; + + struct { + dict_t *params; + char *linkpath; + } symlink; struct { - ino_t ino; + int32_t mode; off_t offset; - struct stat buf; - } truncate; + size_t len; + } fallocate; struct { - ino_t ino; off_t offset; - struct stat buf; - } ftruncate; + size_t len; + } discard; - struct { - ino_t ino; - struct timespec tv[2]; - struct stat buf; - } utimens; + struct { + off_t offset; + off_t len; + struct iatt prebuf; + struct iatt postbuf; + } zerofill; - struct { - dict_t *dict; - int32_t flags; - } setxattr; - struct { - const char *name; - } removexattr; + } cont; - /* dir write */ - - struct { - ino_t ino; - fd_t *fd; - int32_t flags; - mode_t mode; - inode_t *inode; - struct stat buf; - } create; + struct { + off_t start, len; - struct { - ino_t ino; - dev_t dev; - mode_t mode; - inode_t *inode; - struct stat buf; - } mknod; + gf_boolean_t eager_lock_on; + int *eager_lock; - struct { - ino_t ino; - int32_t mode; - inode_t *inode; - struct stat buf; - } mkdir; + char *basename; + char *new_basename; - struct { - int32_t op_ret; - int32_t op_errno; - } unlink; + loc_t parent_loc; + loc_t new_parent_loc; - struct { - int32_t op_ret; - int32_t op_errno; - } rmdir; + afr_transaction_type type; - struct { - ino_t ino; - struct stat buf; - } rename; + /* stub to resume on destruction + of the transaction frame */ + call_stub_t *resume_stub; - struct { - ino_t ino; - inode_t *inode; - struct stat buf; - } link; + struct list_head eager_locked; - struct { - ino_t ino; - inode_t *inode; - struct stat buf; - char *linkpath; - } symlink; + unsigned char *pre_op; - struct { - int32_t flags; - dir_entry_t *entries; - int32_t count; - } setdents; - } cont; - - struct { - off_t start, len; + /* @fop_subvols: subvolumes on which FOP will be attempted */ + unsigned char *fop_subvols; - unsigned char *locked_nodes; - int lock_count; + /* @failed_subvols: subvolumes on which FOP failed. Always + a subset of @fop_subvols */ + unsigned char *failed_subvols; - const char *basename; - const char *new_basename; + /* @dirtied: flag which indicates whether we set dirty flag + in the OP. Typically true when we are performing operation + on more than one subvol and optimistic changelog is disabled - loc_t parent_loc; - loc_t new_parent_loc; + A 'true' value set in @dirtied flag means an 'undirtying' + has to be done in POST-OP phase. + */ + gf_boolean_t dirtied; - afr_transaction_type type; + /* @inherited: flag which indicates that the dirty flags + of the previous transaction were inherited + */ + gf_boolean_t inherited; - int success_count; - int erase_pending; - int failure_count; + /* + @no_uninherit: flag which indicates that a pre_op_uninherit() + must _not_ be attempted (and returned as failure) always. This + flag is set when a hard pre-op is performed, but not accounted + for it in fd_ctx->on_disk[]. Such transactions are "isolated" + from the pre-op piggybacking entirely and therefore uninherit + must not be attempted. + */ + gf_boolean_t no_uninherit; - int last_tried; - int32_t *child_errno; + /* @uninherit_done: + @uninherit_value: - call_frame_t *main_frame; + The above pair variables make pre_op_uninherit() idempotent. + Both are FALSE initially. The first call to pre_op_uninherit + sets @uninherit_done to TRUE and the return value to + @uninherit_value. Further calls will check for @uninherit_done + to be TRUE and if so will simply return @uninherit_value. + */ + gf_boolean_t uninherit_done; + gf_boolean_t uninherit_value; - int (*fop) (call_frame_t *frame, xlator_t *this); + /* @changelog_resume: function to be called after changlogging + (either pre-op or post-op) is done + */ - int (*done) (call_frame_t *frame, xlator_t *this); + afr_changelog_resume_t changelog_resume; - int (*resume) (call_frame_t *frame, xlator_t *this); + call_frame_t *main_frame; - int (*unwind) (call_frame_t *frame, xlator_t *this); - } transaction; + int (*wind) (call_frame_t *frame, xlator_t *this, int subvol); - afr_self_heal_t self_heal; -} afr_local_t; + int (*fop) (call_frame_t *frame, xlator_t *this); + int (*done) (call_frame_t *frame, xlator_t *this); -typedef struct { - unsigned char pre_op_done; - unsigned char *child_failed; -} afr_fd_ctx_t; + int (*resume) (call_frame_t *frame, xlator_t *this); + + int (*unwind) (call_frame_t *frame, xlator_t *this); + + /* post-op hook */ + } transaction; + + syncbarrier_t barrier; + + struct marker_str marker; + /* extra data for fops */ + dict_t *xdata_req; + dict_t *xdata_rsp; -/* try alloc and if it fails, goto label */ -#define ALLOC_OR_GOTO(var, type, label) do { \ - var = CALLOC (sizeof (type), 1); \ - if (!var) { \ - gf_log (this->name, GF_LOG_ERROR, \ - "out of memory :("); \ - op_errno = ENOMEM; \ - goto label; \ - } \ - } while (0); + mode_t umask; + int xflag; + gf_boolean_t do_discovery; + struct afr_reply *replies; +} afr_local_t; /* did a call fail due to a child failing? */ -#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \ - ((op_errno == ENOTCONN) || \ - (op_errno == EBADFD))) +#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \ + ((op_errno == ENOTCONN) || \ + (op_errno == EBADFD))) -#define afr_fop_failed(op_ret, op_errno) ((op_ret) == -1) +int +afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvols, + int *event_generation); +int +__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvols, + int *event_generation); -/* have we tried all children? */ -#define all_tried(i, count) ((i) == (count) - 1) +int +__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvol, + int event_generation); +int +afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvols, + int event_generation); int -afr_fd_ctx_set (xlator_t *this, fd_t *fd); +afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this); -uint64_t -afr_read_child (xlator_t *this, inode_t *inode); +int +afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this, + unsigned char *readable); + +int +afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this, + unsigned char *readable, int *event_p, + int type); +int +afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p, + int *event_p, afr_transaction_type type); + +#define afr_data_subvol_get(i, t, s, e) \ + afr_read_subvol_get(i, t, s, e, AFR_DATA_TRANSACTION) + +#define afr_metadata_subvol_get(i, t, s, e) \ + afr_read_subvol_get(i, t, s, e, AFR_METADATA_TRANSACTION) + +int +afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_inode_refresh_cbk_t cbk); + +int32_t +afr_notify (xlator_t *this, int32_t event, void *data, void *data2); + +int +afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local, + loc_t *loc, char *basename, int child_count); void -afr_set_read_child (xlator_t *this, inode_t *inode, int32_t read_child); +afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock); + +int +afr_attempt_lock_recovery (xlator_t *this, int32_t child_index); + +int +afr_mark_locked_nodes (xlator_t *this, fd_t *fd, + unsigned char *locked_nodes); void -afr_build_parent_loc (loc_t *parent, loc_t *child); +afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner); int -afr_up_children_count (int child_count, unsigned char *child_up); +afr_set_lock_number (call_frame_t *frame, xlator_t *this); + +int32_t +afr_unlock (call_frame_t *frame, xlator_t *this); int -afr_locked_nodes_count (unsigned char *locked_nodes, int child_count); +afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this); + +int +afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this); + +int +afr_blocking_lock (call_frame_t *frame, xlator_t *this); + +int +afr_internal_lock_finish (call_frame_t *frame, xlator_t *this); + +int +afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, + unsigned int child_count); int -afr_first_up_child (afr_private_t *priv); +__afr_fd_ctx_set (xlator_t *this, fd_t *fd); + +int +afr_fd_ctx_set (xlator_t *this, fd_t *fd); -ino64_t -afr_itransform (ino64_t ino, int child_count, int child_index); +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this); + +int +afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno); + +int +afr_locked_nodes_count (unsigned char *locked_nodes, int child_count); int -afr_deitransform (ino64_t ino, int child_count); +afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode); + +void +afr_replies_wipe (afr_local_t *local, afr_private_t *priv); void afr_local_cleanup (afr_local_t *local, xlator_t *this); @@ -486,101 +822,155 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this); int afr_frame_return (call_frame_t *frame); -#define AFR_STACK_UNWIND(frame, params ...) \ - do { \ - afr_local_t *__local = NULL; \ - xlator_t *__this = NULL; \ - __local = frame->local; \ - __this = frame->this; \ - frame->local = NULL; \ - STACK_UNWIND (frame, params); \ - afr_local_cleanup (__local, __this); \ - free (__local); \ -} while (0); - -#define AFR_STACK_DESTROY(frame) \ - do { \ - afr_local_t *__local = NULL; \ - xlator_t *__this = NULL; \ - __local = frame->local; \ - __this = frame->this; \ - frame->local = NULL; \ - STACK_DESTROY (frame->root); \ - afr_local_cleanup (__local, __this); \ - free (__local); \ -} while (0); +int +afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata); + +void +afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this); + +int +afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd); + +#define AFR_STACK_UNWIND(fop, frame, params ...) \ + do { \ + afr_local_t *__local = NULL; \ + xlator_t *__this = NULL; \ + if (frame) { \ + __local = frame->local; \ + __this = frame->this; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT (fop, frame, params); \ + if (__local) { \ + afr_local_cleanup (__local, __this); \ + mem_put (__local); \ + } \ + } while (0) + +#define AFR_STACK_DESTROY(frame) \ + do { \ + afr_local_t *__local = NULL; \ + xlator_t *__this = NULL; \ + __local = frame->local; \ + __this = frame->this; \ + frame->local = NULL; \ + STACK_DESTROY (frame->root); \ + if (__local) { \ + afr_local_cleanup (__local, __this); \ + mem_put (__local); \ + } \ + } while (0); + +#define AFR_FRAME_INIT(frame, op_errno) \ + ({frame->local = mem_get0 (THIS->local_pool); \ + if (afr_local_init (frame->local, THIS->private, &op_errno)) { \ + afr_local_cleanup (frame->local, THIS); \ + mem_put (frame->local); \ + frame->local = NULL; }; \ + frame->local;}) + +#define AFR_STACK_RESET(frame) do { int opr; STACK_RESET (frame->root); AFR_FRAME_INIT(frame, opr);} while (0) /* allocate and return a string that is the basename of argument */ -static inline char * -AFR_BASENAME (const char *str) +static inline char * +AFR_BASENAME (const char *str) { - char *__tmp_str = NULL; - char *__basename_str = NULL; - __tmp_str = strdup (str); - __basename_str = strdup (basename (__tmp_str)); - FREE (__tmp_str); - return __basename_str; + char *__tmp_str = NULL; + char *__basename_str = NULL; + __tmp_str = gf_strdup (str); + __basename_str = gf_strdup (basename (__tmp_str)); + GF_FREE (__tmp_str); + return __basename_str; } -/* initialize local_t */ -static inline int -AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv) -{ - local->child_up = CALLOC (sizeof (*local->child_up), - priv->child_count); - if (!local->child_up) { - return -ENOMEM; - } +call_frame_t * +afr_copy_frame (call_frame_t *base); - memcpy (local->child_up, priv->child_up, - sizeof (*local->child_up) * priv->child_count); +int +afr_transaction_local_init (afr_local_t *local, xlator_t *this); +int32_t +afr_marker_getxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name,afr_local_t *local, afr_private_t *priv ); - local->call_count = afr_up_children_count (priv->child_count, local->child_up); - if (local->call_count == 0) - return -ENOTCONN; +int +afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno); - local->transaction.erase_pending = 1; +int +afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, + transaction_lk_type_t lk_type); - local->op_ret = -1; - local->op_errno = EUCLEAN; +int +afr_higher_errno (int32_t old_errno, int32_t new_errno); - return 0; -} +int +afr_final_errno (afr_local_t *local, afr_private_t *priv); +int +afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req); -static inline int -afr_transaction_local_init (afr_local_t *local, afr_private_t *priv) -{ - int i; - - local->child_errno = CALLOC (sizeof (*local->child_errno), - priv->child_count); - if (!local->child_errno) { - return -ENOMEM; - } - - local->pending = CALLOC (sizeof (*local->pending), - priv->child_count); - - if (!local->pending) { - return -ENOMEM; - } - - for (i = 0; i < priv->child_count; i++) { - local->pending[i] = CALLOC (sizeof (*local->pending[i]), - 3); /* data + metadata + entry */ - if (!local->pending[i]) - return -ENOMEM; - } - - local->transaction.locked_nodes = CALLOC (sizeof (*local->transaction.locked_nodes), - priv->child_count); +void +afr_fix_open (fd_t *fd, xlator_t *this); - local->transaction.child_errno = CALLOC (sizeof (*local->transaction.child_errno), - priv->child_count); +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this); - return 0; -} +void +afr_set_low_priority (call_frame_t *frame); +int +afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child, + int flags); +gf_boolean_t +afr_have_quorum (char *logname, afr_private_t *priv); + +void +afr_matrix_cleanup (int32_t **pending, unsigned int m); + +int32_t** +afr_matrix_create (unsigned int m, unsigned int n); + +void +afr_filter_xattrs (dict_t *xattr); + +/* + * Special value indicating we should use the "auto" quorum method instead of + * a fixed value (including zero to turn off quorum enforcement). + */ +#define AFR_QUORUM_AUTO INT_MAX + +/* + * Having this as a macro will make debugging a bit weirder, but does reduce + * the probability of functions handling this check inconsistently. + */ +#define QUORUM_CHECK(_func,_label) do { \ + if (priv->quorum_count && !afr_have_quorum(this->name,priv)) { \ + gf_log(this->name,GF_LOG_WARNING, \ + "failing "#_func" due to lack of quorum"); \ + op_errno = EROFS; \ + goto _label; \ + } \ +} while (0); + +int +afr_fd_report_unstable_write (xlator_t *this, fd_t *fd); + +gf_boolean_t +afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd); + +void +afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub); + +int +afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count); + +void +afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this); + +int +afr_local_pathinfo (char *pathinfo, gf_boolean_t *is_local); + +void +afr_remove_eager_lock_stub (afr_local_t *local); #endif /* __AFR_H__ */ diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c new file mode 100644 index 000000000..eed509956 --- /dev/null +++ b/xlators/cluster/afr/src/pump.c @@ -0,0 +1,2473 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <unistd.h> +#include <sys/time.h> +#include <stdlib.h> +#include <fnmatch.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "afr-common.c" +#include "defaults.c" +#include "glusterfs.h" +#include "pump.h" + + +static int +afr_set_dict_gfid (dict_t *dict, uuid_t gfid) +{ + int ret = 0; + uuid_t *pgfid = NULL; + + GF_ASSERT (gfid); + + pgfid = GF_CALLOC (1, sizeof (uuid_t), gf_common_mt_char); + if (!pgfid) { + ret = -1; + goto out; + } + + uuid_copy (*pgfid, gfid); + + ret = dict_set_dynptr (dict, "gfid-req", pgfid, sizeof (uuid_t)); + if (ret) + gf_log (THIS->name, GF_LOG_ERROR, "gfid set failed"); + +out: + if (ret && pgfid) + GF_FREE (pgfid); + return ret; +} + +static int +afr_set_root_gfid (dict_t *dict) +{ + uuid_t gfid; + int ret = 0; + + memset (gfid, 0, 16); + gfid[15] = 1; + + ret = afr_set_dict_gfid (dict, gfid); + + return ret; +} + +static int +afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) +{ + int ret = -1; + uuid_t pargfid = {0}; + + if (!child) + goto out; + + if (!uuid_is_null (parent->inode->gfid)) + uuid_copy (pargfid, parent->inode->gfid); + else if (!uuid_is_null (parent->gfid)) + uuid_copy (pargfid, parent->gfid); + + if (uuid_is_null (pargfid)) + goto out; + + if (strcmp (parent->path, "/") == 0) + ret = gf_asprintf ((char **)&child->path, "/%s", name); + else + ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path, + name); + + if (-1 == ret) { + gf_log (this->name, GF_LOG_ERROR, + "asprintf failed while setting child path"); + } + + child->name = strrchr (child->path, '/'); + if (child->name) + child->name++; + + child->parent = inode_ref (parent->inode); + child->inode = inode_new (parent->inode->table); + uuid_copy (child->pargfid, pargfid); + + if (!child->inode) { + ret = -1; + goto out; + } + + ret = 0; +out: + if ((ret == -1) && child) + loc_wipe (child); + + return ret; +} + +static void +afr_build_root_loc (xlator_t *this, loc_t *loc) +{ + afr_private_t *priv = NULL; + + priv = this->private; + loc->path = gf_strdup ("/"); + loc->name = ""; + loc->inode = inode_ref (priv->root_inode); + uuid_copy (loc->gfid, loc->inode->gfid); +} + +static void +afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent) +{ + GF_ASSERT (loc); + GF_ASSERT (buf); + + uuid_copy (loc->gfid, buf->ia_gfid); + if (postparent) + uuid_copy (loc->pargfid, postparent->ia_gfid); +} + +static uint64_t pump_pid = 0; +static inline void +pump_fill_loc_info (loc_t *loc, struct iatt *iatt, struct iatt *parent) +{ + afr_update_loc_gfids (loc, iatt, parent); + uuid_copy (loc->inode->gfid, iatt->ia_gfid); +} + +static int +pump_mark_start_pending (xlator_t *this) +{ + afr_private_t *priv = NULL; + pump_private_t *pump_priv = NULL; + + priv = this->private; + pump_priv = priv->pump_private; + + pump_priv->pump_start_pending = 1; + + return 0; +} + +static int +is_pump_start_pending (xlator_t *this) +{ + afr_private_t *priv = NULL; + pump_private_t *pump_priv = NULL; + + priv = this->private; + pump_priv = priv->pump_private; + + return (pump_priv->pump_start_pending); +} + +static int +pump_remove_start_pending (xlator_t *this) +{ + afr_private_t *priv = NULL; + pump_private_t *pump_priv = NULL; + + priv = this->private; + pump_priv = priv->pump_private; + + pump_priv->pump_start_pending = 0; + + return 0; +} + +static pump_state_t +pump_get_state () +{ + xlator_t *this = NULL; + afr_private_t *priv = NULL; + pump_private_t *pump_priv = NULL; + + pump_state_t ret; + + this = THIS; + priv = this->private; + pump_priv = priv->pump_private; + + LOCK (&pump_priv->pump_state_lock); + { + ret = pump_priv->pump_state; + } + UNLOCK (&pump_priv->pump_state_lock); + + return ret; +} + +int +pump_change_state (xlator_t *this, pump_state_t state) +{ + afr_private_t *priv = NULL; + pump_private_t *pump_priv = NULL; + + pump_state_t state_old; + pump_state_t state_new; + + + priv = this->private; + pump_priv = priv->pump_private; + + GF_ASSERT (pump_priv); + + LOCK (&pump_priv->pump_state_lock); + { + state_old = pump_priv->pump_state; + state_new = state; + + pump_priv->pump_state = state; + + } + UNLOCK (&pump_priv->pump_state_lock); + + gf_log (this->name, GF_LOG_DEBUG, + "Pump changing state from %d to %d", + state_old, + state_new); + + return 0; +} + +static int +pump_set_resume_path (xlator_t *this, const char *path) +{ + int ret = 0; + + afr_private_t *priv = NULL; + pump_private_t *pump_priv = NULL; + + priv = this->private; + pump_priv = priv->pump_private; + + GF_ASSERT (pump_priv); + + LOCK (&pump_priv->resume_path_lock); + { + strncpy (pump_priv->resume_path, path, strlen (path) + 1); + } + UNLOCK (&pump_priv->resume_path_lock); + + return ret; +} + +static int +pump_save_path (xlator_t *this, const char *path) +{ + afr_private_t *priv = NULL; + pump_state_t state; + dict_t *dict = NULL; + loc_t loc = {0}; + int dict_ret = 0; + int ret = -1; + + state = pump_get_state (); + if (state == PUMP_STATE_RESUME) + return 0; + + priv = this->private; + + GF_ASSERT (priv->root_inode); + + afr_build_root_loc (this, &loc); + + dict = dict_new (); + dict_ret = dict_set_str (dict, PUMP_PATH, (char *)path); + if (dict_ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set the key %s", path, PUMP_PATH); + + ret = syncop_setxattr (PUMP_SOURCE_CHILD (this), &loc, dict, 0); + + if (ret < 0) { + gf_log (this->name, GF_LOG_INFO, + "setxattr failed - could not save path=%s", path); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "setxattr succeeded - saved path=%s", path); + } + + dict_unref (dict); + + loc_wipe (&loc); + return 0; +} + +static int +pump_check_and_update_status (xlator_t *this) +{ + pump_state_t state; + int ret = -1; + + state = pump_get_state (); + + switch (state) { + + case PUMP_STATE_RESUME: + case PUMP_STATE_RUNNING: + { + ret = 0; + break; + } + case PUMP_STATE_PAUSE: + { + ret = -1; + break; + } + case PUMP_STATE_ABORT: + { + pump_save_path (this, "/"); + ret = -1; + break; + } + default: + { + gf_log (this->name, GF_LOG_DEBUG, + "Unknown pump state"); + ret = -1; + break; + } + + } + + return ret; +} + +static const char * +pump_get_resume_path (xlator_t *this) +{ + afr_private_t *priv = NULL; + pump_private_t *pump_priv = NULL; + + const char *resume_path = NULL; + + priv = this->private; + pump_priv = priv->pump_private; + + resume_path = pump_priv->resume_path; + + return resume_path; +} + +static int +pump_update_resume_state (xlator_t *this, const char *path) +{ + pump_state_t state; + const char *resume_path = NULL; + + state = pump_get_state (); + + if (state == PUMP_STATE_RESUME) { + resume_path = pump_get_resume_path (this); + if (strcmp (resume_path, "/") == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "Reached the resume path (/). Proceeding to change state" + " to running"); + pump_change_state (this, PUMP_STATE_RUNNING); + } else if (strcmp (resume_path, path) == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "Reached the resume path. Proceeding to change state" + " to running"); + pump_change_state (this, PUMP_STATE_RUNNING); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "Not yet hit the resume path:res-path=%s,path=%s", + resume_path, path); + } + } + + return 0; +} + +static gf_boolean_t +is_pump_traversal_allowed (xlator_t *this, const char *path) +{ + pump_state_t state; + const char *resume_path = NULL; + gf_boolean_t ret = _gf_true; + + state = pump_get_state (); + + if (state == PUMP_STATE_RESUME) { + resume_path = pump_get_resume_path (this); + if (strstr (resume_path, path)) { + gf_log (this->name, GF_LOG_DEBUG, + "On the right path to resumption path"); + ret = _gf_true; + } else { + gf_log (this->name, GF_LOG_DEBUG, + "Not the right path to resuming=> ignoring traverse"); + ret = _gf_false; + } + } + + return ret; +} + +static int +pump_save_file_stats (xlator_t *this, const char *path) +{ + afr_private_t *priv = NULL; + pump_private_t *pump_priv = NULL; + + priv = this->private; + pump_priv = priv->pump_private; + + LOCK (&pump_priv->resume_path_lock); + { + pump_priv->number_files_pumped++; + + strncpy (pump_priv->current_file, path, + PATH_MAX); + } + UNLOCK (&pump_priv->resume_path_lock); + + return 0; +} + +static int +gf_pump_traverse_directory (loc_t *loc) +{ + xlator_t *this = NULL; + fd_t *fd = NULL; + off_t offset = 0; + loc_t entry_loc = {0}; + gf_dirent_t *entry = NULL; + gf_dirent_t *tmp = NULL; + gf_dirent_t entries; + struct iatt iatt = {0}; + struct iatt parent = {0}; + dict_t *xattr_rsp = NULL; + int ret = 0; + gf_boolean_t is_directory_empty = _gf_true; + gf_boolean_t free_entries = _gf_false; + + INIT_LIST_HEAD (&entries.list); + this = THIS; + + GF_ASSERT (loc->inode); + + fd = fd_create (loc->inode, pump_pid); + if (!fd) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to create fd for %s", loc->path); + goto out; + } + + ret = syncop_opendir (this, loc, fd); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "opendir failed on %s", loc->path); + goto out; + } + + gf_log (this->name, GF_LOG_TRACE, + "pump opendir on %s returned=%d", + loc->path, ret); + + while (syncop_readdirp (this, fd, 131072, offset, NULL, &entries)) { + free_entries = _gf_true; + + if (list_empty (&entries.list)) { + gf_log (this->name, GF_LOG_TRACE, + "no more entries in directory"); + goto out; + } + + list_for_each_entry_safe (entry, tmp, &entries.list, list) { + gf_log (this->name, GF_LOG_DEBUG, + "found readdir entry=%s", entry->d_name); + + offset = entry->d_off; + if (uuid_is_null (entry->d_stat.ia_gfid)) { + gf_log (this->name, GF_LOG_WARNING, "%s/%s: No " + "gfid present skipping", + loc->path, entry->d_name); + continue; + } + loc_wipe (&entry_loc); + ret = afr_build_child_loc (this, &entry_loc, loc, + entry->d_name); + if (ret) + goto out; + + if ((strcmp (entry->d_name, ".") == 0) || + (strcmp (entry->d_name, "..") == 0)) + continue; + + is_directory_empty = _gf_false; + gf_log (this->name, GF_LOG_DEBUG, + "lookup %s => %"PRId64, + entry_loc.path, + iatt.ia_ino); + + ret = syncop_lookup (this, &entry_loc, NULL, &iatt, + &xattr_rsp, &parent); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: lookup failed", entry_loc.path); + continue; + } + + ret = afr_selfheal_name (this, loc->gfid, entry->d_name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: name self-heal failed (%s/%s)", + entry_loc.path, uuid_utoa (loc->gfid), + entry->d_name); + continue; + } + + ret = afr_selfheal (this, iatt.ia_gfid); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: self-heal failed (%s)", + entry_loc.path, uuid_utoa (iatt.ia_gfid)); + continue; + } + + pump_fill_loc_info (&entry_loc, &iatt, &parent); + + pump_update_resume_state (this, entry_loc.path); + + pump_save_path (this, entry_loc.path); + pump_save_file_stats (this, entry_loc.path); + + ret = pump_check_and_update_status (this); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "Pump beginning to exit out"); + goto out; + } + + if (IA_ISDIR (iatt.ia_type)) { + if (is_pump_traversal_allowed (this, entry_loc.path)) { + gf_log (this->name, GF_LOG_TRACE, + "entering dir=%s", entry->d_name); + gf_pump_traverse_directory (&entry_loc); + } + } + } + + gf_dirent_free (&entries); + free_entries = _gf_false; + gf_log (this->name, GF_LOG_TRACE, "offset incremented to %d", + (int32_t ) offset); + + } + + ret = syncop_close (fd); + if (ret < 0) + gf_log (this->name, GF_LOG_DEBUG, "closing the fd failed"); + + if (is_directory_empty && (strcmp (loc->path, "/") == 0)) { + pump_change_state (this, PUMP_STATE_RUNNING); + gf_log (this->name, GF_LOG_INFO, "Empty source brick. " + "Nothing to be done."); + } + +out: + if (entry_loc.path) + loc_wipe (&entry_loc); + if (free_entries) + gf_dirent_free (&entries); + return 0; +} + +static int +pump_update_resume_path (xlator_t *this) +{ + const char *resume_path = NULL; + + resume_path = pump_get_resume_path (this); + + if (resume_path) { + gf_log (this->name, GF_LOG_DEBUG, + "Found a path to resume from: %s", + resume_path); + + }else { + gf_log (this->name, GF_LOG_DEBUG, + "Did not find a path=> setting to '/'"); + pump_set_resume_path (this, "/"); + } + + pump_change_state (this, PUMP_STATE_RESUME); + + return 0; +} + +static int32_t +pump_xattr_cleaner (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + afr_private_t *priv = NULL; + loc_t loc = {0}; + int i = 0; + int ret = 0; + int source = 0; + int sink = 1; + + priv = this->private; + + afr_build_root_loc (this, &loc); + + ret = syncop_removexattr (priv->children[source], &loc, + PUMP_PATH, 0); + + ret = syncop_removexattr (priv->children[sink], &loc, + PUMP_SINK_COMPLETE, 0); + + for (i = 0; i < priv->child_count; i++) { + ret = syncop_removexattr (priv->children[i], &loc, + PUMP_SOURCE_COMPLETE, 0); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, "removexattr " + "failed with %s", strerror (-ret)); + } + } + + loc_wipe (&loc); + return pump_command_reply (frame, this); +} + +static int +pump_complete_migration (xlator_t *this) +{ + afr_private_t *priv = NULL; + pump_private_t *pump_priv = NULL; + dict_t *dict = NULL; + pump_state_t state; + loc_t loc = {0}; + int dict_ret = 0; + int ret = -1; + + priv = this->private; + pump_priv = priv->pump_private; + + GF_ASSERT (priv->root_inode); + + afr_build_root_loc (this, &loc); + + dict = dict_new (); + + state = pump_get_state (); + if (state == PUMP_STATE_RUNNING) { + gf_log (this->name, GF_LOG_DEBUG, + "Pump finished pumping"); + + pump_priv->pump_finished = _gf_true; + + dict_ret = dict_set_str (dict, PUMP_SOURCE_COMPLETE, "jargon"); + if (dict_ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set the key %s", + loc.path, PUMP_SOURCE_COMPLETE); + + ret = syncop_setxattr (PUMP_SOURCE_CHILD (this), &loc, dict, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "setxattr failed - while notifying source complete"); + } + dict_ret = dict_set_str (dict, PUMP_SINK_COMPLETE, "jargon"); + if (dict_ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set the key %s", + loc.path, PUMP_SINK_COMPLETE); + + ret = syncop_setxattr (PUMP_SINK_CHILD (this), &loc, dict, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "setxattr failed - while notifying sink complete"); + } + + pump_save_path (this, "/"); + + } else if (state == PUMP_STATE_ABORT) { + gf_log (this->name, GF_LOG_DEBUG, "Starting cleanup " + "of pump internal xattrs"); + call_resume (pump_priv->cleaner); + } + + loc_wipe (&loc); + return 0; +} + +static int +pump_lookup_sink (loc_t *loc) +{ + xlator_t *this = NULL; + struct iatt iatt, parent; + dict_t *xattr_rsp; + dict_t *xattr_req = NULL; + int ret = 0; + + this = THIS; + + xattr_req = dict_new (); + + ret = afr_set_root_gfid (xattr_req); + if (ret) + goto out; + + ret = syncop_lookup (PUMP_SINK_CHILD (this), loc, + xattr_req, &iatt, &xattr_rsp, &parent); + + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "Lookup on sink child failed"); + ret = -1; + goto out; + } + +out: + if (xattr_req) + dict_unref (xattr_req); + + return ret; +} + +static int +pump_task (void *data) +{ + xlator_t *this = NULL; + afr_private_t *priv = NULL; + + + loc_t loc = {0}; + struct iatt iatt, parent; + dict_t *xattr_rsp = NULL; + dict_t *xattr_req = NULL; + + int ret = -1; + + this = THIS; + priv = this->private; + + GF_ASSERT (priv->root_inode); + + afr_build_root_loc (this, &loc); + xattr_req = dict_new (); + if (!xattr_req) { + gf_log (this->name, GF_LOG_DEBUG, + "Out of memory"); + ret = -1; + goto out; + } + + afr_set_root_gfid (xattr_req); + ret = syncop_lookup (this, &loc, xattr_req, + &iatt, &xattr_rsp, &parent); + + gf_log (this->name, GF_LOG_TRACE, + "lookup: path=%s gfid=%s", + loc.path, uuid_utoa (loc.inode->gfid)); + + ret = pump_check_and_update_status (this); + if (ret < 0) { + goto out; + } + + pump_update_resume_path (this); + + afr_set_root_gfid (xattr_req); + ret = pump_lookup_sink (&loc); + if (ret) { + pump_update_resume_path (this); + goto out; + } + + gf_pump_traverse_directory (&loc); + + pump_complete_migration (this); +out: + if (xattr_req) + dict_unref (xattr_req); + + loc_wipe (&loc); + return 0; +} + + +static int +pump_task_completion (int ret, call_frame_t *sync_frame, void *data) +{ + xlator_t *this = NULL; + afr_private_t *priv = NULL; + + this = THIS; + + priv = this->private; + + inode_unref (priv->root_inode); + STACK_DESTROY (sync_frame->root); + + gf_log (this->name, GF_LOG_DEBUG, + "Pump xlator exiting"); + return 0; +} + +int +pump_start (call_frame_t *pump_frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + pump_private_t *pump_priv = NULL; + + int ret = -1; + + priv = this->private; + pump_priv = priv->pump_private; + + afr_set_lk_owner (pump_frame, this, pump_frame->root); + pump_pid = (uint64_t) (unsigned long)pump_frame->root; + + ret = synctask_new (pump_priv->env, pump_task, + pump_task_completion, + pump_frame, NULL); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "starting pump failed"); + pump_change_state (this, PUMP_STATE_ABORT); + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "setting pump as started lk_owner: %s %"PRIu64, + lkowner_utoa (&pump_frame->root->lk_owner), pump_pid); + + priv->use_afr_in_pump = 1; +out: + return ret; +} + +static int +pump_start_synctask (xlator_t *this) +{ + call_frame_t *frame = NULL; + int ret = 0; + + frame = create_frame (this, this->ctx->pool); + if (!frame) { + gf_log (this->name, GF_LOG_ERROR, + "Out of memory"); + ret = -1; + goto out; + } + + pump_change_state (this, PUMP_STATE_RUNNING); + + ret = pump_start (frame, this); + +out: + return ret; +} + +int32_t +pump_cmd_start_setxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, dict_t *xdata) + +{ + call_frame_t *prev = NULL; + afr_local_t *local = NULL; + int ret = 0; + + local = frame->local; + + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "Could not initiate destination " + "brick connect"); + ret = op_ret; + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "Successfully initiated destination " + "brick connect"); + + pump_mark_start_pending (this); + + /* send the PARENT_UP as pump is ready now */ + prev = cookie; + if (prev && prev->this) + prev->this->notify (prev->this, GF_EVENT_PARENT_UP, this); + +out: + local->op_ret = ret; + pump_command_reply (frame, this); + + return 0; +} + +static int +pump_initiate_sink_connect (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + dict_t *dict = NULL; + data_t *data = NULL; + char *clnt_cmd = NULL; + loc_t loc = {0}; + + int ret = 0; + + priv = this->private; + local = frame->local; + + GF_ASSERT (priv->root_inode); + + afr_build_root_loc (this, &loc); + + data = data_ref (dict_get (local->dict, RB_PUMP_CMD_START)); + if (!data) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "Could not get destination brick value"); + goto out; + } + + dict = dict_new (); + if (!dict) { + ret = -1; + goto out; + } + + clnt_cmd = GF_CALLOC (1, data->len+1, gf_common_mt_char); + if (!clnt_cmd) { + ret = -1; + goto out; + } + + memcpy (clnt_cmd, data->data, data->len); + clnt_cmd[data->len] = '\0'; + gf_log (this->name, GF_LOG_DEBUG, "Got destination brick %s\n", + clnt_cmd); + + ret = dict_set_dynstr (dict, CLIENT_CMD_CONNECT, clnt_cmd); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "Could not inititiate destination brick " + "connect"); + goto out; + } + + STACK_WIND (frame, + pump_cmd_start_setxattr_cbk, + PUMP_SINK_CHILD(this), + PUMP_SINK_CHILD(this)->fops->setxattr, + &loc, + dict, + 0, NULL); + + ret = 0; + +out: + if (dict) + dict_unref (dict); + + if (data) + data_unref (data); + + if (ret && clnt_cmd) + GF_FREE (clnt_cmd); + + loc_wipe (&loc); + return ret; +} + +static int +is_pump_aborted (xlator_t *this) +{ + pump_state_t state; + + state = pump_get_state (); + + return ((state == PUMP_STATE_ABORT)); +} + +int32_t +pump_cmd_start_getxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; + char *path = NULL; + + pump_state_t state; + int ret = 0; + int need_unwind = 0; + int dict_ret = -1; + + local = frame->local; + + if (op_ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "getxattr failed - changing pump " + "state to RUNNING with '/'"); + path = "/"; + ret = op_ret; + } else { + gf_log (this->name, GF_LOG_TRACE, + "getxattr succeeded"); + + dict_ret = dict_get_str (dict, PUMP_PATH, &path); + if (dict_ret < 0) + path = "/"; + } + + state = pump_get_state (); + if ((state == PUMP_STATE_RUNNING) || + (state == PUMP_STATE_RESUME)) { + gf_log (this->name, GF_LOG_ERROR, + "Pump is already started"); + ret = -1; + goto out; + } + + pump_set_resume_path (this, path); + + if (is_pump_aborted (this)) + /* We're re-starting pump afresh */ + ret = pump_initiate_sink_connect (frame, this); + else { + /* We're re-starting pump from a previous + pause */ + gf_log (this->name, GF_LOG_DEBUG, + "about to start synctask"); + ret = pump_start_synctask (this); + need_unwind = 1; + } + +out: + if ((ret < 0) || (need_unwind == 1)) { + local->op_ret = ret; + pump_command_reply (frame, this); + } + return 0; +} + +int +pump_execute_status (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + pump_private_t *pump_priv = NULL; + + uint64_t number_files = 0; + + char filename[PATH_MAX]; + char summary[PATH_MAX+256]; + char *dict_str = NULL; + + int32_t op_ret = 0; + int32_t op_errno = 0; + + dict_t *dict = NULL; + int ret = -1; + + priv = this->private; + pump_priv = priv->pump_private; + + LOCK (&pump_priv->resume_path_lock); + { + number_files = pump_priv->number_files_pumped; + strncpy (filename, pump_priv->current_file, PATH_MAX); + } + UNLOCK (&pump_priv->resume_path_lock); + + dict_str = GF_CALLOC (1, PATH_MAX + 256, gf_afr_mt_char); + if (!dict_str) { + gf_log (this->name, GF_LOG_ERROR, + "Out of memory"); + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + + if (pump_priv->pump_finished) { + snprintf (summary, PATH_MAX+256, + "no_of_files=%"PRIu64, number_files); + } else { + snprintf (summary, PATH_MAX+256, + "no_of_files=%"PRIu64":current_file=%s", + number_files, filename); + } + snprintf (dict_str, PATH_MAX+256, "status=%d:%s", + (pump_priv->pump_finished)?1:0, summary); + + dict = dict_new (); + + ret = dict_set_dynstr (dict, RB_PUMP_CMD_STATUS, dict_str); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "dict_set_dynstr returned negative value"); + } else { + dict_str = NULL; + } + + op_ret = 0; + +out: + + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, NULL); + + if (dict) + dict_unref (dict); + + GF_FREE (dict_str); + + return 0; +} + +int +pump_execute_pause (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + pump_change_state (this, PUMP_STATE_PAUSE); + + local->op_ret = 0; + pump_command_reply (frame, this); + + return 0; +} + +int +pump_execute_start (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = 0; + loc_t loc = {0}; + + priv = this->private; + local = frame->local; + + if (!priv->root_inode) { + gf_log (this->name, GF_LOG_ERROR, + "Pump xlator cannot be started without an initial " + "lookup"); + ret = -1; + goto out; + } + + GF_ASSERT (priv->root_inode); + + afr_build_root_loc (this, &loc); + + STACK_WIND (frame, + pump_cmd_start_getxattr_cbk, + PUMP_SOURCE_CHILD(this), + PUMP_SOURCE_CHILD(this)->fops->getxattr, + &loc, + PUMP_PATH, NULL); + + ret = 0; + +out: + if (ret < 0) { + local->op_ret = ret; + pump_command_reply (frame, this); + } + + loc_wipe (&loc); + return 0; +} + +static int +pump_cleanup_helper (void *data) { + call_frame_t *frame = data; + + pump_xattr_cleaner (frame, 0, frame->this, 0, 0, NULL); + + return 0; +} + +static int +pump_cleanup_done (int ret, call_frame_t *sync_frame, void *data) +{ + STACK_DESTROY (sync_frame->root); + + return 0; +} + +int +pump_execute_commit (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + pump_private_t *pump_priv = NULL; + afr_local_t *local = NULL; + call_frame_t *sync_frame = NULL; + int ret = 0; + + priv = this->private; + pump_priv = priv->pump_private; + local = frame->local; + + local->op_ret = 0; + if (pump_priv->pump_finished) { + pump_change_state (this, PUMP_STATE_COMMIT); + sync_frame = create_frame (this, this->ctx->pool); + ret = synctask_new (pump_priv->env, pump_cleanup_helper, + pump_cleanup_done, sync_frame, frame); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, "Couldn't create " + "synctask for cleaning up xattrs."); + } + + } else { + gf_log (this->name, GF_LOG_ERROR, "Commit can't proceed. " + "Migration in progress"); + local->op_ret = -1; + local->op_errno = EINPROGRESS; + pump_command_reply (frame, this); + } + + return 0; +} +int +pump_execute_abort (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + pump_private_t *pump_priv = NULL; + afr_local_t *local = NULL; + call_frame_t *sync_frame = NULL; + int ret = 0; + + priv = this->private; + pump_priv = priv->pump_private; + local = frame->local; + + pump_change_state (this, PUMP_STATE_ABORT); + + LOCK (&pump_priv->resume_path_lock); + { + pump_priv->number_files_pumped = 0; + pump_priv->current_file[0] = '\0'; + } + UNLOCK (&pump_priv->resume_path_lock); + + local->op_ret = 0; + if (pump_priv->pump_finished) { + sync_frame = create_frame (this, this->ctx->pool); + ret = synctask_new (pump_priv->env, pump_cleanup_helper, + pump_cleanup_done, sync_frame, frame); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, "Couldn't create " + "synctask for cleaning up xattrs."); + } + + } else { + pump_priv->cleaner = fop_setxattr_cbk_stub (frame, + pump_xattr_cleaner, + 0, 0, NULL); + } + + return 0; +} + +gf_boolean_t +pump_command_status (xlator_t *this, dict_t *dict) +{ + char *cmd = NULL; + int dict_ret = -1; + int ret = _gf_true; + + dict_ret = dict_get_str (dict, RB_PUMP_CMD_STATUS, &cmd); + if (dict_ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "Not a pump status command"); + ret = _gf_false; + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "Hit a pump command - status"); + ret = _gf_true; + +out: + return ret; + +} + +gf_boolean_t +pump_command_pause (xlator_t *this, dict_t *dict) +{ + char *cmd = NULL; + int dict_ret = -1; + int ret = _gf_true; + + dict_ret = dict_get_str (dict, RB_PUMP_CMD_PAUSE, &cmd); + if (dict_ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "Not a pump pause command"); + ret = _gf_false; + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "Hit a pump command - pause"); + ret = _gf_true; + +out: + return ret; + +} + +gf_boolean_t +pump_command_commit (xlator_t *this, dict_t *dict) +{ + char *cmd = NULL; + int dict_ret = -1; + int ret = _gf_true; + + dict_ret = dict_get_str (dict, RB_PUMP_CMD_COMMIT, &cmd); + if (dict_ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "Not a pump commit command"); + ret = _gf_false; + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "Hit a pump command - commit"); + ret = _gf_true; + +out: + return ret; + +} + +gf_boolean_t +pump_command_abort (xlator_t *this, dict_t *dict) +{ + char *cmd = NULL; + int dict_ret = -1; + int ret = _gf_true; + + dict_ret = dict_get_str (dict, RB_PUMP_CMD_ABORT, &cmd); + if (dict_ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "Not a pump abort command"); + ret = _gf_false; + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "Hit a pump command - abort"); + ret = _gf_true; + +out: + return ret; + +} + +gf_boolean_t +pump_command_start (xlator_t *this, dict_t *dict) +{ + char *cmd = NULL; + int dict_ret = -1; + int ret = _gf_true; + + dict_ret = dict_get_str (dict, RB_PUMP_CMD_START, &cmd); + if (dict_ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "Not a pump start command"); + ret = _gf_false; + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "Hit a pump command - start"); + ret = _gf_true; + +out: + return ret; + +} + +int +pump_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + afr_private_t *priv = NULL; + int op_errno = 0; + int ret = 0; + + priv = this->private; + + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, default_getxattr_cbk, + FIRST_CHILD (this), + (FIRST_CHILD (this))->fops->getxattr, + loc, name, xdata); + return 0; + } + + if (name) { + if (!strncmp (name, AFR_XATTR_PREFIX, + strlen (AFR_XATTR_PREFIX))) { + + op_errno = ENODATA; + goto out; + } + + if (!strcmp (name, RB_PUMP_CMD_STATUS)) { + gf_log (this->name, GF_LOG_DEBUG, + "Hit pump command - status"); + pump_execute_status (frame, this); + ret = 0; + goto out; + } + } + + afr_getxattr (frame, this, loc, name, xdata); + + ret = 0; +out: + if (ret < 0) + AFR_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL); + return 0; +} + +int +pump_command_reply (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) + gf_log (this->name, GF_LOG_INFO, + "Command failed"); + else + gf_log (this->name, GF_LOG_INFO, + "Command succeeded"); + + AFR_STACK_UNWIND (setxattr, + frame, + local->op_ret, + local->op_errno, NULL); + + return 0; +} + +int +pump_parse_command (call_frame_t *frame, xlator_t *this, dict_t *dict, + int *op_errno_p) +{ + afr_local_t *local = NULL; + int ret = -1; + int op_errno = 0; + + if (pump_command_start (this, dict)) { + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + local->dict = dict_ref (dict); + ret = pump_execute_start (frame, this); + + } else if (pump_command_pause (this, dict)) { + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + local->dict = dict_ref (dict); + ret = pump_execute_pause (frame, this); + + } else if (pump_command_abort (this, dict)) { + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + local->dict = dict_ref (dict); + ret = pump_execute_abort (frame, this); + + } else if (pump_command_commit (this, dict)) { + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + local->dict = dict_ref (dict); + ret = pump_execute_commit (frame, this); + } +out: + if (op_errno_p) + *op_errno_p = op_errno; + return ret; +} + +int +pump_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) +{ + afr_private_t *priv = NULL; + int ret = -1; + int op_errno = 0; + + GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.pump*", dict, op_errno, out); + + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, default_setxattr_cbk, + FIRST_CHILD (this), + (FIRST_CHILD (this))->fops->setxattr, + loc, dict, flags, xdata); + return 0; + } + + ret = pump_parse_command (frame, this, dict, &op_errno); + if (ret >= 0) + goto out; + + afr_setxattr (frame, this, loc, dict, flags, xdata); + + ret = 0; +out: + if (ret < 0) { + AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + } + + return 0; +} + +/* Defaults */ +static int32_t +pump_lookup (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *xattr_req) +{ + afr_private_t *priv = NULL; + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, + default_lookup_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, + loc, + xattr_req); + return 0; + } + + afr_lookup (frame, this, loc, xattr_req); + return 0; +} + + +static int32_t +pump_truncate (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + off_t offset, dict_t *xdata) +{ + afr_private_t *priv = NULL; + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, + default_truncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, + loc, + offset, xdata); + return 0; + } + + afr_truncate (frame, this, loc, offset, xdata); + return 0; +} + + +static int32_t +pump_ftruncate (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + off_t offset, dict_t *xdata) +{ + afr_private_t *priv = NULL; + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, + default_ftruncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, + fd, + offset, xdata); + return 0; + } + + afr_ftruncate (frame, this, fd, offset, xdata); + return 0; +} + + + + +int +pump_mknod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata) +{ + afr_private_t *priv = NULL; + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, default_mknod_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, + loc, mode, rdev, umask, xdata); + return 0; + } + afr_mknod (frame, this, loc, mode, rdev, umask, xdata); + return 0; + +} + + + +int +pump_mkdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata) +{ + afr_private_t *priv = NULL; + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, default_mkdir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, + loc, mode, umask, xdata); + return 0; + } + afr_mkdir (frame, this, loc, mode, umask, xdata); + return 0; + +} + + +static int32_t +pump_unlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc, int xflag, dict_t *xdata) +{ + afr_private_t *priv = NULL; + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, + default_unlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, + loc, xflag, xdata); + return 0; + } + afr_unlink (frame, this, loc, xflag, xdata); + return 0; + +} + + +static int +pump_rmdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, int flags, dict_t *xdata) +{ + afr_private_t *priv = NULL; + + priv = this->private; + + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, default_rmdir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rmdir, + loc, flags, xdata); + return 0; + } + + afr_rmdir (frame, this, loc, flags, xdata); + return 0; + +} + + + +int +pump_symlink (call_frame_t *frame, xlator_t *this, + const char *linkpath, loc_t *loc, mode_t umask, dict_t *xdata) +{ + afr_private_t *priv = NULL; + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, default_symlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->symlink, + linkpath, loc, umask, xdata); + return 0; + } + afr_symlink (frame, this, linkpath, loc, umask, xdata); + return 0; + +} + + +static int32_t +pump_rename (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc, dict_t *xdata) +{ + afr_private_t *priv = NULL; + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, + default_rename_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, + oldloc, newloc, xdata); + return 0; + } + afr_rename (frame, this, oldloc, newloc, xdata); + return 0; + +} + + +static int32_t +pump_link (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc, dict_t *xdata) +{ + afr_private_t *priv = NULL; + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, + default_link_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, + oldloc, newloc, xdata); + return 0; + } + afr_link (frame, this, oldloc, newloc, xdata); + return 0; + +} + + +static int32_t +pump_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, mode_t mode, + mode_t umask, fd_t *fd, dict_t *xdata) +{ + afr_private_t *priv = NULL; + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, default_create_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, + loc, flags, mode, umask, fd, xdata); + return 0; + } + afr_create (frame, this, loc, flags, mode, umask, fd, xdata); + return 0; + +} + + +static int32_t +pump_open (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, fd_t *fd, dict_t *xdata) +{ + afr_private_t *priv = NULL; + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, + default_open_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, + loc, flags, fd, xdata); + return 0; + } + afr_open (frame, this, loc, flags, fd, xdata); + return 0; + +} + + +static int32_t +pump_writev (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t off, uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + afr_private_t *priv = NULL; + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, + default_writev_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, + fd, + vector, + count, + off, flags, + iobref, xdata); + return 0; + } + + afr_writev (frame, this, fd, vector, count, off, flags, iobref, xdata); + return 0; +} + + +static int32_t +pump_flush (call_frame_t *frame, + xlator_t *this, + fd_t *fd, dict_t *xdata) +{ + afr_private_t *priv = NULL; + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, + default_flush_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, + fd, xdata); + return 0; + } + afr_flush (frame, this, fd, xdata); + return 0; + +} + + +static int32_t +pump_fsync (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags, dict_t *xdata) +{ + afr_private_t *priv = NULL; + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, + default_fsync_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, + fd, + flags, xdata); + return 0; + } + afr_fsync (frame, this, fd, flags, xdata); + return 0; + +} + + +static int32_t +pump_opendir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, fd_t *fd, dict_t *xdata) +{ + afr_private_t *priv = NULL; + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, + default_opendir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->opendir, + loc, fd, xdata); + return 0; + } + afr_opendir (frame, this, loc, fd, xdata); + return 0; + +} + + +static int32_t +pump_fsyncdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags, dict_t *xdata) +{ + afr_private_t *priv = NULL; + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, + default_fsyncdir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsyncdir, + fd, + flags, xdata); + return 0; + } + afr_fsyncdir (frame, this, fd, flags, xdata); + return 0; + +} + + +static int32_t +pump_xattrop (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + gf_xattrop_flags_t flags, + dict_t *dict, dict_t *xdata) +{ + afr_private_t *priv = NULL; + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, + default_xattrop_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->xattrop, + loc, + flags, + dict, xdata); + return 0; + } + afr_xattrop (frame, this, loc, flags, dict, xdata); + return 0; + +} + +static int32_t +pump_fxattrop (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + gf_xattrop_flags_t flags, + dict_t *dict, dict_t *xdata) +{ + afr_private_t *priv = NULL; + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, + default_fxattrop_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fxattrop, + fd, + flags, + dict, xdata); + return 0; + } + afr_fxattrop (frame, this, fd, flags, dict, xdata); + return 0; + +} + + +static int32_t +pump_removexattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name, dict_t *xdata) +{ + afr_private_t *priv = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO (this, out); + + GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.pump*", + name, op_errno, out); + + op_errno = 0; + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, + default_removexattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, + loc, + name, xdata); + return 0; + } + afr_removexattr (frame, this, loc, name, xdata); + + out: + if (op_errno) + AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); + return 0; + +} + + + +static int32_t +pump_readdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t off, dict_t *xdata) +{ + afr_private_t *priv = NULL; + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, + default_readdir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdir, + fd, size, off, xdata); + return 0; + } + afr_readdir (frame, this, fd, size, off, xdata); + return 0; + +} + + +static int32_t +pump_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t off, dict_t *dict) +{ + afr_private_t *priv = NULL; + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, + default_readdirp_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, + fd, size, off, dict); + return 0; + } + afr_readdirp (frame, this, fd, size, off, dict); + return 0; + +} + + + +static int32_t +pump_releasedir (xlator_t *this, + fd_t *fd) +{ + afr_private_t *priv = NULL; + priv = this->private; + if (priv->use_afr_in_pump) + afr_releasedir (this, fd); + return 0; + +} + +static int32_t +pump_release (xlator_t *this, + fd_t *fd) +{ + afr_private_t *priv = NULL; + priv = this->private; + if (priv->use_afr_in_pump) + afr_release (this, fd); + return 0; + +} + +static int32_t +pump_forget (xlator_t *this, inode_t *inode) +{ + afr_private_t *priv = NULL; + + priv = this->private; + if (priv->use_afr_in_pump) + afr_forget (this, inode); + + return 0; +} + +static int32_t +pump_setattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + afr_private_t *priv = NULL; + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, + default_setattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setattr, + loc, stbuf, valid, xdata); + return 0; + } + afr_setattr (frame, this, loc, stbuf, valid, xdata); + return 0; + +} + + +static int32_t +pump_fsetattr (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + afr_private_t *priv = NULL; + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, + default_fsetattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fsetattr, + fd, stbuf, valid, xdata); + return 0; + } + afr_fsetattr (frame, this, fd, stbuf, valid, xdata); + return 0; + +} + + +/* End of defaults */ + + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init (this, gf_afr_mt_end + 1); + + if (ret != 0) { + gf_log(this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); + return ret; + } + + return ret; +} + +static int +is_xlator_pump_sink (xlator_t *child) +{ + return (child == PUMP_SINK_CHILD(THIS)); +} + +static int +is_xlator_pump_source (xlator_t *child) +{ + return (child == PUMP_SOURCE_CHILD(THIS)); +} + +int32_t +notify (xlator_t *this, int32_t event, + void *data, ...) +{ + int ret = -1; + xlator_t *child_xl = NULL; + + child_xl = (xlator_t *) data; + + ret = afr_notify (this, event, data, NULL); + + switch (event) { + case GF_EVENT_CHILD_DOWN: + if (is_xlator_pump_source (child_xl)) + pump_change_state (this, PUMP_STATE_ABORT); + break; + + case GF_EVENT_CHILD_UP: + if (is_xlator_pump_sink (child_xl)) + if (is_pump_start_pending (this)) { + gf_log (this->name, GF_LOG_DEBUG, + "about to start synctask"); + ret = pump_start_synctask (this); + if (ret < 0) + gf_log (this->name, GF_LOG_DEBUG, + "Could not start pump " + "synctask"); + else + pump_remove_start_pending (this); + } + } + + return ret; +} + +int32_t +init (xlator_t *this) +{ + afr_private_t * priv = NULL; + pump_private_t *pump_priv = NULL; + int child_count = 0; + xlator_list_t * trav = NULL; + int i = 0; + int ret = -1; + GF_UNUSED int op_errno = 0; + + int source_child = 0; + + if (!this->children) { + gf_log (this->name, GF_LOG_ERROR, + "pump translator needs a source and sink" + "subvolumes defined."); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "Volume is dangling."); + } + + priv = GF_CALLOC (1, sizeof (afr_private_t), gf_afr_mt_afr_private_t); + if (!priv) + goto out; + + LOCK_INIT (&priv->lock); + + child_count = xlator_subvolume_count (this); + if (child_count != 2) { + gf_log (this->name, GF_LOG_ERROR, + "There should be exactly 2 children - one source " + "and one sink"); + return -1; + } + priv->child_count = child_count; + + priv->read_child = source_child; + priv->favorite_child = source_child; + priv->background_self_heal_count = 0; + + priv->data_self_heal = "on"; + priv->metadata_self_heal = 1; + priv->entry_self_heal = 1; + + priv->data_self_heal_window_size = 16; + + priv->data_change_log = 1; + priv->metadata_change_log = 1; + priv->entry_change_log = 1; + priv->use_afr_in_pump = 1; + priv->sh_readdir_size = 65536; + + /* Locking options */ + + /* Lock server count infact does not matter. Locks are held + on all subvolumes, in this case being the source + and the sink. + */ + + priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, + gf_afr_mt_char); + if (!priv->child_up) { + gf_log (this->name, GF_LOG_ERROR, + "Out of memory."); + op_errno = ENOMEM; + goto out; + } + + priv->children = GF_CALLOC (sizeof (xlator_t *), child_count, + gf_afr_mt_xlator_t); + if (!priv->children) { + gf_log (this->name, GF_LOG_ERROR, + "Out of memory."); + op_errno = ENOMEM; + goto out; + } + + priv->pending_key = GF_CALLOC (sizeof (*priv->pending_key), + child_count, + gf_afr_mt_char); + if (!priv->pending_key) { + gf_log (this->name, GF_LOG_ERROR, + "Out of memory."); + op_errno = ENOMEM; + goto out; + } + + trav = this->children; + i = 0; + while (i < child_count) { + priv->children[i] = trav->xlator; + + ret = gf_asprintf (&priv->pending_key[i], "%s.%s", + AFR_XATTR_PREFIX, + trav->xlator->name); + if (-1 == ret) { + gf_log (this->name, GF_LOG_ERROR, + "asprintf failed to set pending key"); + op_errno = ENOMEM; + goto out; + } + + trav = trav->next; + i++; + } + + ret = gf_asprintf (&priv->sh_domain, "%s-self-heal", this->name); + if (-1 == ret) { + op_errno = ENOMEM; + goto out; + } + + priv->root_inode = NULL; + + priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event), + gf_afr_mt_int32_t); + if (!priv->last_event) { + ret = -ENOMEM; + goto out; + } + + pump_priv = GF_CALLOC (1, sizeof (*pump_priv), + gf_afr_mt_pump_priv); + if (!pump_priv) { + gf_log (this->name, GF_LOG_ERROR, + "Out of memory"); + op_errno = ENOMEM; + goto out; + } + + LOCK_INIT (&pump_priv->resume_path_lock); + LOCK_INIT (&pump_priv->pump_state_lock); + + pump_priv->resume_path = GF_CALLOC (1, PATH_MAX, + gf_afr_mt_char); + if (!pump_priv->resume_path) { + gf_log (this->name, GF_LOG_ERROR, "Out of memory"); + ret = -1; + goto out; + } + + pump_priv->env = this->ctx->env; + if (!pump_priv->env) { + gf_log (this->name, GF_LOG_ERROR, + "Could not create new sync-environment"); + ret = -1; + goto out; + } + + /* keep more local here as we may need them for self-heal etc */ + this->local_pool = mem_pool_new (afr_local_t, 128); + if (!this->local_pool) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "failed to create local_t's memory pool"); + goto out; + } + + priv->pump_private = pump_priv; + pump_priv = NULL; + + this->private = priv; + priv = NULL; + + pump_change_state (this, PUMP_STATE_ABORT); + + ret = 0; +out: + + if (pump_priv) { + GF_FREE (pump_priv->resume_path); + LOCK_DESTROY (&pump_priv->resume_path_lock); + LOCK_DESTROY (&pump_priv->pump_state_lock); + GF_FREE (pump_priv); + } + + if (priv) { + GF_FREE (priv->child_up); + GF_FREE (priv->children); + GF_FREE (priv->pending_key); + GF_FREE (priv->last_event); + LOCK_DESTROY (&priv->lock); + GF_FREE (priv); + } + + return ret; +} + +int +fini (xlator_t *this) +{ + afr_private_t * priv = NULL; + pump_private_t *pump_priv = NULL; + + priv = this->private; + this->private = NULL; + if (!priv) + goto out; + + pump_priv = priv->pump_private; + if (!pump_priv) + goto afr_priv; + + GF_FREE (pump_priv->resume_path); + LOCK_DESTROY (&pump_priv->resume_path_lock); + LOCK_DESTROY (&pump_priv->pump_state_lock); + GF_FREE (pump_priv); +afr_priv: + afr_priv_destroy (priv); +out: + return 0; +} + + +struct xlator_fops fops = { + .lookup = pump_lookup, + .open = pump_open, + .flush = pump_flush, + .fsync = pump_fsync, + .fsyncdir = pump_fsyncdir, + .xattrop = pump_xattrop, + .fxattrop = pump_fxattrop, + .getxattr = pump_getxattr, + + /* inode write */ + .writev = pump_writev, + .truncate = pump_truncate, + .ftruncate = pump_ftruncate, + .setxattr = pump_setxattr, + .setattr = pump_setattr, + .fsetattr = pump_fsetattr, + .removexattr = pump_removexattr, + + /* dir read */ + .opendir = pump_opendir, + .readdir = pump_readdir, + .readdirp = pump_readdirp, + + /* dir write */ + .create = pump_create, + .mknod = pump_mknod, + .mkdir = pump_mkdir, + .unlink = pump_unlink, + .rmdir = pump_rmdir, + .link = pump_link, + .symlink = pump_symlink, + .rename = pump_rename, +}; + +struct xlator_dumpops dumpops = { + .priv = afr_priv_dump, +}; + + +struct xlator_cbks cbks = { + .release = pump_release, + .releasedir = pump_releasedir, + .forget = pump_forget, +}; + +struct volume_options options[] = { + { .key = {NULL} }, +}; diff --git a/xlators/cluster/afr/src/pump.h b/xlators/cluster/afr/src/pump.h new file mode 100644 index 000000000..9d0b6db6a --- /dev/null +++ b/xlators/cluster/afr/src/pump.h @@ -0,0 +1,81 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __PUMP_H__ +#define __PUMP_H__ + +#include "syncop.h" + +/* FIXME: Needs to be defined in a common file */ +#define CLIENT_CMD_CONNECT "trusted.glusterfs.client-connect" +#define CLIENT_CMD_DISCONNECT "trusted.glusterfs.client-disconnect" + +#define PUMP_SOURCE_COMPLETE "trusted.glusterfs.pump-source-complete" +#define PUMP_SINK_COMPLETE "trusted.glusterfs.pump-sink-complete" + +#define PUMP_PATH "trusted.glusterfs.pump-path" + +#define PUMP_SOURCE_CHILD(xl) (xl->children->xlator) +#define PUMP_SINK_CHILD(xl) (xl->children->next->xlator) + +typedef enum { + PUMP_STATE_RUNNING, /* Pump is running and migrating files */ + PUMP_STATE_RESUME, /* Pump is resuming from a previous pause */ + PUMP_STATE_PAUSE, /* Pump is paused */ + PUMP_STATE_ABORT, /* Pump is aborted */ + PUMP_STATE_COMMIT, /* Pump is commited */ +} pump_state_t; + +typedef struct _pump_private { + struct syncenv *env; /* The env pointer to the pump synctask */ + char *resume_path; /* path to resume from the last pause */ + gf_lock_t resume_path_lock; /* Synchronize resume_path changes */ + gf_lock_t pump_state_lock; /* Synchronize pump_state changes */ + pump_state_t pump_state; /* State of pump */ + char current_file[PATH_MAX]; /* Current file being pumped */ + uint64_t number_files_pumped; /* Number of files pumped */ + gf_boolean_t pump_finished; /* Boolean to indicate pump termination */ + char pump_start_pending; /* Boolean to mark start pending until + CHILD_UP */ + call_stub_t *cleaner; +} pump_private_t; + +void +build_root_loc (inode_t *inode, loc_t *loc); +int pump_start (call_frame_t *frame, xlator_t *this); + +gf_boolean_t +pump_command_start (xlator_t *this, dict_t *dict); + +int +pump_execute_start (call_frame_t *frame, xlator_t *this); + +gf_boolean_t +pump_command_pause (xlator_t *this, dict_t *dict); + +int +pump_execute_pause (call_frame_t *frame, xlator_t *this); + +gf_boolean_t +pump_command_abort (xlator_t *this, dict_t *dict); + +int +pump_execute_abort (call_frame_t *frame, xlator_t *this); + +gf_boolean_t +pump_command_status (xlator_t *this, dict_t *dict); + +int +pump_execute_status (call_frame_t *frame, xlator_t *this); + +int +pump_command_reply (call_frame_t *frame, xlator_t *this); + +#endif /* __PUMP_H__ */ diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am index f87212699..3fc29bf81 100644 --- a/xlators/cluster/dht/src/Makefile.am +++ b/xlators/cluster/dht/src/Makefile.am @@ -1,30 +1,37 @@ - -xlator_LTLIBRARIES = dht.la nufa.la +xlator_LTLIBRARIES = dht.la nufa.la switch.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster +dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c dht-rebalance.c \ + dht-selfheal.c dht-rename.c dht-hashfn.c dht-diskusage.c \ + dht-common.c dht-inode-write.c dht-inode-read.c dht-shared.c \ + $(top_builddir)/xlators/lib/src/libxlator.c -dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c \ - dht-selfheal.c dht-rename.c dht-hashfn.c dht-diskusage.c - -dht_la_SOURCES = $(dht_common_source) dht.c +dht_la_SOURCES = $(dht_common_source) dht.c nufa_la_SOURCES = $(dht_common_source) nufa.c +switch_la_SOURCES = $(dht_common_source) switch.c -dht_la_LDFLAGS = -module -avoidversion +dht_la_LDFLAGS = -module -avoid-version dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -nufa_la_LDFLAGS = -module -avoidversion +nufa_la_LDFLAGS = -module -avoid-version nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = dht-common.h dht-common.c +switch_la_LDFLAGS = -module -avoid-version +switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = dht-common.h dht-mem-types.h \ + $(top_builddir)/xlators/lib/src/libxlator.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/xlators/lib/src -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) +AM_CFLAGS = -Wall $(GF_CFLAGS) -CLEANFILES = +CLEANFILES = uninstall-local: rm -f $(DESTDIR)$(xlatordir)/distribute.so install-data-hook: - ln -sf dht.so $(DESTDIR)$(xlatordir)/distribute.so
\ No newline at end of file + ln -sf dht.so $(DESTDIR)$(xlatordir)/distribute.so diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 62f3822a5..3868fc38f 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2009-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -27,10 +18,87 @@ #include "glusterfs.h" #include "xlator.h" +#include "libxlator.h" #include "dht-common.h" #include "defaults.h" +#include "byte-order.h" +#include "glusterfs-acl.h" #include <sys/time.h> +#include <libgen.h> + +int +dht_aggregate (dict_t *this, char *key, data_t *value, void *data) +{ + dict_t *dst = NULL; + int64_t *ptr = 0, *size = NULL; + int32_t ret = -1; + data_t *dict_data = NULL; + + dst = data; + + if (strcmp (key, GF_XATTR_QUOTA_SIZE_KEY) == 0) { + ret = dict_get_bin (dst, key, (void **)&size); + if (ret < 0) { + size = GF_CALLOC (1, sizeof (int64_t), + gf_common_mt_char); + if (size == NULL) { + gf_log ("dht", GF_LOG_WARNING, + "memory allocation failed"); + return -1; + } + ret = dict_set_bin (dst, key, size, sizeof (int64_t)); + if (ret < 0) { + gf_log ("dht", GF_LOG_WARNING, + "dht aggregate dict set failed"); + GF_FREE (size); + return -1; + } + } + + ptr = data_to_bin (value); + if (ptr == NULL) { + gf_log ("dht", GF_LOG_WARNING, "data to bin failed"); + return -1; + } + + *size = hton64 (ntoh64 (*size) + ntoh64 (*ptr)); + + } else if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) { + ret = gf_get_min_stime (THIS, dst, key, value); + if (ret < 0) + return ret; + } else { + /* compare user xattrs only */ + if (!strncmp (key, "user.", strlen ("user."))) { + ret = dict_lookup (dst, key, &dict_data); + if (!ret && dict_data && value) { + ret = is_data_equal (dict_data, value); + if (!ret) + gf_log ("dht", GF_LOG_DEBUG, + "xattr mismatch for %s", key); + } + } + ret = dict_set (dst, key, value); + if (ret) + gf_log ("dht", GF_LOG_WARNING, "xattr dict set failed"); + } + + return 0; +} + + +void +dht_aggregate_xattr (dict_t *dst, dict_t *src) +{ + if ((dst == NULL) || (src == NULL)) { + goto out; + } + + dict_foreach (src, dht_aggregate, dst); +out: + return; +} /* TODO: - use volumename in xattr instead of "dht" @@ -42,100 +110,369 @@ int dht_lookup_selfheal_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int op_ret, int op_errno) + xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; - int ret = 0; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int ret = -1; - local = frame->local; - ret = op_ret; - - if (ret == 0) { - layout = local->selfheal.layout; - ret = inode_ctx_put (local->inode, this, - (uint64_t)(long)layout); - - if (ret == 0) - local->selfheal.layout = NULL; - - if (local->st_ino) { - local->stbuf.st_ino = local->st_ino; - } else { - gf_log (this->name, GF_LOG_DEBUG, - "could not find hashed subvolume for %s", - local->loc.path); - } - } + GF_VALIDATE_OR_GOTO ("dht", frame, out); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); - DHT_STACK_UNWIND (frame, ret, local->op_errno, local->inode, - &local->stbuf, local->xattr); + local = frame->local; + ret = op_ret; - return 0; + FRAME_SU_UNDO (frame, dht_local_t); + + if (ret == 0) { + layout = local->selfheal.layout; + ret = dht_layout_set (this, local->inode, layout); + } + + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); + } + + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); + + DHT_STACK_UNWIND (lookup, frame, ret, local->op_errno, local->inode, + &local->stbuf, local->xattr, &local->postparent); + +out: + return ret; +} + + +int +dht_discover_complete (xlator_t *this, call_frame_t *discover_frame) +{ + dht_local_t *local = NULL; + call_frame_t *main_frame = NULL; + int op_errno = 0; + int ret = -1; + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; + + local = discover_frame->local; + layout = local->layout; + conf = this->private; + + LOCK(&discover_frame->lock); + { + main_frame = local->main_frame; + local->main_frame = NULL; + } + UNLOCK(&discover_frame->lock); + + if (!main_frame) + return 0; + + if (local->file_count && local->dir_count) { + gf_log (this->name, GF_LOG_ERROR, + "path %s exists as a file on one subvolume " + "and directory on another. " + "Please fix it manually", + local->loc.path); + op_errno = EIO; + goto out; + } + + if (local->cached_subvol) { + ret = dht_layout_preset (this, local->cached_subvol, + local->inode); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set layout for subvolume %s", + local->cached_subvol ? local->cached_subvol->name : "<nil>"); + op_errno = EINVAL; + goto out; + } + } else { + ret = dht_layout_normalize (this, &local->loc, layout); + if ((ret < 0) || ((ret > 0) && (local->op_ret != 0))) { + /* either the layout is incorrect or the directory is + * not found even in one subvolume. + */ + gf_log (this->name, GF_LOG_DEBUG, + "normalizing failed on %s " + "(overlaps/holes present: %s, " + "ENOENT errors: %d)", local->loc.path, + (ret < 0) ? "yes" : "no", (ret > 0) ? ret : 0); + if ((ret > 0) && (ret == conf->subvolume_cnt)) { + op_errno = ESTALE; + goto out; + } + } + + if (local->inode) + dht_layout_set (this, local->inode, layout); + } + + DHT_STACK_UNWIND (lookup, main_frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); + return 0; +out: + DHT_STACK_UNWIND (lookup, main_frame, -1, op_errno, NULL, NULL, NULL, + NULL); + + return ret; +} + + +int +dht_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + int ret = -1; + int is_dir = 0; + int is_linkfile = 0; + int attempt_unwind = 0; + dht_conf_t *conf = 0; + + GF_VALIDATE_OR_GOTO ("dht", frame, out); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", this->private, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + conf = this->private; + + layout = local->layout; + + /* Check if the gfid is different for file from other node */ + if (!op_ret && uuid_compare (local->gfid, stbuf->ia_gfid)) { + gf_log (this->name, GF_LOG_WARNING, + "%s: gfid different on %s", + local->loc.path, prev->this->name); + } + + + LOCK (&frame->lock); + { + /* TODO: assert equal mode on stbuf->st_mode and + local->stbuf->st_mode + + else mkdir/chmod/chown and fix + */ + ret = dht_layout_merge (this, layout, prev->this, + op_ret, op_errno, xattr); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to merge layouts", local->loc.path); + + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_DEBUG, + "lookup of %s on %s returned error (%s)", + local->loc.path, prev->this->name, + strerror (op_errno)); + + goto unlock; + } + + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); + is_dir = check_is_dir (inode, stbuf, xattr); + + if (is_dir) { + local->dir_count ++; + } else { + local->file_count ++; + + if (!is_linkfile) { + /* real file */ + local->cached_subvol = prev->this; + attempt_unwind = 1; + } else { + goto unlock; + } + } + + local->op_ret = 0; + + if (local->xattr == NULL) { + local->xattr = dict_ref (xattr); + } else { + dht_aggregate_xattr (local->xattr, xattr); + } + + if (local->inode == NULL) + local->inode = inode_ref (inode); + + dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); + dht_iatt_merge (this, &local->postparent, postparent, + prev->this); + } +unlock: + UNLOCK (&frame->lock); +out: + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt) || attempt_unwind) { + dht_discover_complete (this, frame); + } + + if (is_last_call (this_call_cnt)) + DHT_STACK_DESTROY (frame); + + return 0; +} + + +int +dht_discover (call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + int ret; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int call_cnt = 0; + int op_errno = EINVAL; + int i = 0; + call_frame_t *discover_frame = NULL; + + conf = this->private; + local = frame->local; + + ret = dict_set_uint32 (local->xattr_req, conf->xattr_name, 4 * 4); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set '%s' key", + loc->path, conf->xattr_name); + + ret = dict_set_uint32 (local->xattr_req, conf->link_xattr_name, 256); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set '%s' key", + loc->path, conf->link_xattr_name); + + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + local->layout = dht_layout_new (this, conf->subvolume_cnt); + + if (!local->layout) { + op_errno = ENOMEM; + goto err; + } + + uuid_copy (local->gfid, loc->gfid); + + discover_frame = copy_frame (frame); + if (!discover_frame) { + op_errno = ENOMEM; + goto err; + } + + discover_frame->local = local; + frame->local = NULL; + local->main_frame = frame; + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (discover_frame, dht_discover_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, + &local->loc, local->xattr_req); + } + + return 0; + +err: + DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, + NULL); + + return 0; } int dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, - inode_t *inode, struct stat *stbuf, dict_t *xattr) + inode_t *inode, struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) { - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - dht_layout_t *layout = NULL; - int ret = 0; - int is_dir = 0; + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + int ret = -1; + int is_dir = 0; + + GF_VALIDATE_OR_GOTO ("dht", frame, out); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", this->private, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); - conf = this->private; local = frame->local; prev = cookie; - layout = local->layout; + layout = local->layout; + + if (!op_ret && uuid_is_null (local->gfid)) + memcpy (local->gfid, stbuf->ia_gfid, 16); + + /* Check if the gfid is different for file from other node */ + if (!op_ret && uuid_compare (local->gfid, stbuf->ia_gfid)) { + gf_log (this->name, GF_LOG_WARNING, + "%s: gfid different on %s", + local->loc.path, prev->this->name); + } LOCK (&frame->lock); { /* TODO: assert equal mode on stbuf->st_mode and - local->stbuf->st_mode + local->stbuf->st_mode - else mkdir/chmod/chown and fix - */ - ret = dht_layout_merge (this, layout, prev->this, - op_ret, op_errno, xattr); + else mkdir/chmod/chown and fix + */ + ret = dht_layout_merge (this, layout, prev->this, + op_ret, op_errno, xattr); - if (op_ret == -1) { - local->op_errno = ENOENT; - gf_log (this->name, GF_LOG_DEBUG, - "lookup of %s on %s returned error (%s)", - local->loc.path, prev->this->name, - strerror (op_errno)); + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_DEBUG, + "lookup of %s on %s returned error (%s)", + local->loc.path, prev->this->name, + strerror (op_errno)); - goto unlock; - } + goto unlock; + } - is_dir = check_is_dir (inode, stbuf, xattr); - if (!is_dir) { + is_dir = check_is_dir (inode, stbuf, xattr); + if (!is_dir) { gf_log (this->name, GF_LOG_DEBUG, "lookup of %s on %s returned non dir 0%o", local->loc.path, prev->this->name, - stbuf->st_mode); + stbuf->ia_type); local->need_selfheal = 1; - goto unlock; + goto unlock; } - local->op_ret = 0; - if (local->xattr == NULL) - local->xattr = dict_ref (xattr); - if (local->inode == NULL) - local->inode = inode_ref (inode); - - dht_stat_merge (this, &local->stbuf, stbuf, prev->this); + local->op_ret = 0; + if (local->xattr == NULL) { + local->xattr = dict_ref (xattr); + } else { + dht_aggregate_xattr (local->xattr, xattr); + } - if (prev->this == local->hashed_subvol) - local->st_ino = local->stbuf.st_ino; + if (local->inode == NULL) + local->inode = inode_ref (inode); + dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); + dht_iatt_merge (this, &local->postparent, postparent, + prev->this); } unlock: UNLOCK (&frame->lock); @@ -150,176 +487,270 @@ unlock: return 0; } - if (local->op_ret == 0) { - ret = dht_layout_normalize (this, &local->loc, layout); + if (local->op_ret == 0) { + ret = dht_layout_normalize (this, &local->loc, layout); - local->layout = NULL; + if (ret != 0) { + gf_log (this->name, GF_LOG_DEBUG, + "fixing assignment on %s", + local->loc.path); + goto selfheal; + } - if (ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "fixing assignment on %s", - local->loc.path); - goto selfheal; - } - - inode_ctx_put (local->inode, this, - (uint64_t)(long)layout); - - if (local->st_ino) { - local->stbuf.st_ino = local->st_ino; - } else { - gf_log (this->name, GF_LOG_DEBUG, - "could not find hashed subvol for %s", - local->loc.path); - } - } + dht_layout_set (this, local->inode, layout); + } + + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); + } - DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->inode, &local->stbuf, local->xattr); + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); + DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); } - return 0; + return 0; selfheal: - ret = dht_selfheal_directory (frame, dht_lookup_selfheal_cbk, - &local->loc, layout); - - return 0; + FRAME_SU_DO (frame, dht_local_t); + uuid_copy (local->loc.gfid, local->gfid); + ret = dht_selfheal_directory (frame, dht_lookup_selfheal_cbk, + &local->loc, layout); +out: + return ret; } int dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, - inode_t *inode, struct stat *stbuf, dict_t *xattr) + inode_t *inode, struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) { dht_local_t *local = NULL; int this_call_cnt = 0; call_frame_t *prev = NULL; - dht_layout_t *layout = NULL; - dht_conf_t *conf = NULL; - int ret = -1; - int is_dir = 0; - int is_linkfile = 0; + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + int is_dir = 0; + int is_linkfile = 0; + call_frame_t *copy = NULL; + dht_local_t *copy_local = NULL; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, err); + GF_VALIDATE_OR_GOTO ("dht", frame->local, err); + GF_VALIDATE_OR_GOTO ("dht", cookie, err); local = frame->local; prev = cookie; - conf = this->private; + conf = this->private; + if (!conf) + goto out; LOCK (&frame->lock); { - if (op_ret == -1) { - local->op_errno = op_errno; + if (op_ret == -1) { + local->op_errno = op_errno; - if ((op_errno != ENOTCONN) + if ((op_errno != ENOTCONN) && (op_errno != ENOENT) && (op_errno != ESTALE)) { - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); + gf_log (this->name, GF_LOG_INFO, + "subvolume %s for %s returned -1 (%s)", + prev->this->name, local->loc.path, + strerror (op_errno)); } - if (op_errno == ESTALE) { - /* propogate the ESTALE to parent. - * setting local->layout_mismatch would send + /* propagate the ESTALE to parent. + * setting local->return_estale would send * ESTALE to parent. */ - local->layout_mismatch = 1; + local->return_estale = 1; } - goto unlock; - } + /* if it is ENOENT, we may have to do a + * 'lookup_everywhere()' to make sure + * the file is not migrated */ + if (op_errno == ENOENT) { + if (IA_ISREG (local->loc.inode->ia_type)) { + local->need_lookup_everywhere = 1; + } + } + goto unlock; + } - if (S_IFMT & (stbuf->st_mode ^ local->inode->st_mode)) { - gf_log (this->name, GF_LOG_DEBUG, - "mismatching filetypes 0%o v/s 0%o for %s", - (stbuf->st_mode & S_IFMT), - (local->inode->st_mode & S_IFMT), - local->loc.path); + if (stbuf->ia_type != local->inode->ia_type) { + gf_log (this->name, GF_LOG_INFO, + "mismatching filetypes 0%o v/s 0%o for %s", + (stbuf->ia_type), (local->inode->ia_type), + local->loc.path); - local->op_ret = -1; - local->op_errno = EINVAL; + local->op_ret = -1; + local->op_errno = EINVAL; - goto unlock; - } + goto unlock; + } - layout = dht_layout_get (this, inode); - - is_dir = check_is_dir (inode, stbuf, xattr); - is_linkfile = check_is_linkfile (inode, stbuf, xattr); - - if (is_linkfile) { - gf_log (this->name, GF_LOG_DEBUG, - "linkfile found in revalidate for %s", - local->loc.path); - local->layout_mismatch = 1; - - goto unlock; - } + layout = local->layout; - if (is_dir) { - ret = dht_layout_dir_mismatch (this, layout, - prev->this, &local->loc, - xattr); - if (ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "mismatching layouts for %s", - local->loc.path); - - local->layout_mismatch = 1; - - goto unlock; - } - } - - dht_stat_merge (this, &local->stbuf, stbuf, prev->this); - - local->op_ret = 0; - local->stbuf.st_ino = local->st_ino; - - if (!local->xattr) - local->xattr = dict_ref (xattr); - } -unlock: - UNLOCK (&frame->lock); + is_dir = check_is_dir (inode, stbuf, xattr); + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); + + if (is_linkfile) { + gf_log (this->name, GF_LOG_INFO, + "linkfile found in revalidate for %s", + local->loc.path); + local->return_estale = 1; + + goto unlock; + } + + if (is_dir) { + ret = dht_dir_has_layout (xattr, conf->xattr_name); + if (ret >= 0) { + if (is_greater_time(local->stbuf.ia_ctime, + local->stbuf.ia_ctime_nsec, + stbuf->ia_ctime, + stbuf->ia_ctime_nsec)) { + local->prebuf.ia_gid = stbuf->ia_gid; + local->prebuf.ia_uid = stbuf->ia_uid; + } + } + if (local->stbuf.ia_type != IA_INVAL) + { + if ((local->stbuf.ia_gid != stbuf->ia_gid) || + (local->stbuf.ia_uid != stbuf->ia_uid)) { + local->need_selfheal = 1; + } + } + ret = dht_layout_dir_mismatch (this, layout, + prev->this, &local->loc, + xattr); + if (ret != 0) { + gf_log (this->name, GF_LOG_INFO, + "mismatching layouts for %s", + local->loc.path); + + local->layout_mismatch = 1; + + goto unlock; + } + } + + dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); + dht_iatt_merge (this, &local->postparent, postparent, + prev->this); + local->op_ret = 0; + + if (!local->xattr) { + local->xattr = dict_ref (xattr); + } else if (is_dir) { + dht_aggregate_xattr (local->xattr, xattr); + } + } +unlock: + UNLOCK (&frame->lock); +out: this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) { - if (!S_ISDIR (local->stbuf.st_mode) - && (local->hashed_subvol != local->cached_subvol) - && (local->stbuf.st_nlink == 1) - && (conf->unhashed_sticky_bit)) { - local->stbuf.st_mode |= S_ISVTX; - } - + if (!IA_ISDIR (local->stbuf.ia_type) + && (local->hashed_subvol != local->cached_subvol) + && (local->stbuf.ia_nlink == 1) + && (conf && conf->unhashed_sticky_bit)) { + local->stbuf.ia_prot.sticky = 1; + } + if (local->need_selfheal) { + local->need_selfheal = 0; + uuid_copy (local->gfid, local->stbuf.ia_gfid); + local->stbuf.ia_gid = local->prebuf.ia_gid; + local->stbuf.ia_uid = local->prebuf.ia_uid; + copy = create_frame (this, this->ctx->pool); + if (copy) { + copy_local = dht_local_init (copy, &local->loc, + NULL, 0); + if (!copy_local) + goto cont; + copy_local->stbuf = local->stbuf; + copy->local = copy_local; + FRAME_SU_DO (copy, dht_local_t); + ret = synctask_new (this->ctx->env, + dht_dir_attr_heal, + dht_dir_attr_heal_done, + copy, copy); + } + } +cont: if (local->layout_mismatch) { - local->op_ret = -1; - local->op_errno = ESTALE; - } - - DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->inode, &local->stbuf, local->xattr); - } + /* Found layout mismatch in the directory, need to + fix this in the inode context */ + dht_layout_unref (this, local->layout); + local->layout = NULL; + dht_lookup_directory (frame, this, &local->loc); + return 0; + } - return 0; + if (local->need_lookup_everywhere) { + /* As the current layout gave ENOENT error, we would + need a new layout */ + dht_layout_unref (this, local->layout); + local->layout = NULL; + + /* We know that current cached subvol is no more + valid, get the new one */ + local->cached_subvol = NULL; + dht_lookup_everywhere (frame, this, &local->loc); + return 0; + } + if (local->return_estale) { + local->op_ret = -1; + local->op_errno = ESTALE; + } + + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); + } + + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); + DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); + } + +err: + return ret; } int dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct stat *stbuf) + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { - dht_local_t *local = NULL; - xlator_t *cached_subvol = NULL; - dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + xlator_t *cached_subvol = NULL; + dht_conf_t *conf = NULL; int ret = -1; - local = frame->local; - cached_subvol = local->cached_subvol; - conf = this->private; + GF_VALIDATE_OR_GOTO ("dht", frame, out); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", this->private, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); - ret = dht_layout_inode_set (this, local->cached_subvol, inode); + local = frame->local; + cached_subvol = local->cached_subvol; + conf = this->private; + + ret = dht_layout_preset (this, local->cached_subvol, local->loc.inode); if (ret < 0) { gf_log (this->name, GF_LOG_DEBUG, "failed to set layout for subvolume %s", @@ -329,65 +760,232 @@ dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie, goto unwind; } - local->op_ret = 0; - if ((local->stbuf.st_nlink == 1) - && (conf->unhashed_sticky_bit)) { - local->stbuf.st_mode |= S_ISVTX; - } + local->op_ret = 0; + if ((local->stbuf.ia_nlink == 1) + && (conf && conf->unhashed_sticky_bit)) { + local->stbuf.ia_prot.sticky = 1; + } + + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + postparent, 1); + } unwind: - DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->inode, &local->stbuf, local->xattr); - return 0; + if (local->linked == _gf_true) + dht_linkfile_attr_heal (frame, this); + + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); + DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); +out: + return ret; +} + + +int +dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this) +{ + int ret = 0; + dht_local_t *local = NULL; + xlator_t *hashed_subvol = NULL; + xlator_t *cached_subvol = NULL; + dht_layout_t *layout = NULL; + + local = frame->local; + hashed_subvol = local->hashed_subvol; + cached_subvol = local->cached_subvol; + + if (local->file_count && local->dir_count) { + gf_log (this->name, GF_LOG_ERROR, + "path %s exists as a file on one subvolume " + "and directory on another. " + "Please fix it manually", + local->loc.path); + DHT_STACK_UNWIND (lookup, frame, -1, EIO, NULL, NULL, NULL, + NULL); + return 0; + } + + if (local->dir_count) { + dht_lookup_directory (frame, this, &local->loc); + return 0; + } + + if (!cached_subvol) { + DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL, NULL, + NULL); + return 0; + } + + if (local->need_lookup_everywhere) { + if (uuid_compare (local->gfid, local->inode->gfid)) { + /* GFID different, return error */ + DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, + NULL, NULL, NULL); + return 0; + } + local->op_ret = 0; + local->op_errno = 0; + layout = dht_layout_for_subvol (this, cached_subvol); + if (!layout) { + gf_log (this->name, GF_LOG_INFO, + "%s: no pre-set layout for subvolume %s", + local->loc.path, (cached_subvol ? + cached_subvol->name : + "<nil>")); + } + + ret = dht_layout_set (this, local->inode, layout); + if (ret < 0) { + gf_log (this->name, GF_LOG_INFO, + "%s: failed to set layout for subvol %s", + local->loc.path, (cached_subvol ? + cached_subvol->name : + "<nil>")); + } + + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); + } + + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); + DHT_STACK_UNWIND (lookup, frame, local->op_ret, + local->op_errno, local->inode, + &local->stbuf, local->xattr, + &local->postparent); + return 0; + } + + if (!hashed_subvol) { + gf_log (this->name, GF_LOG_INFO, + "cannot create linkfile file for %s on %s: " + "hashed subvolume cannot be found.", + local->loc.path, cached_subvol->name); + + local->op_ret = 0; + local->op_errno = 0; + + ret = dht_layout_preset (frame->this, cached_subvol, + local->inode); + if (ret < 0) { + gf_log (this->name, GF_LOG_INFO, + "failed to set layout for subvol %s", + cached_subvol ? cached_subvol->name : + "<nil>"); + local->op_ret = -1; + local->op_errno = EINVAL; + } + + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); + } + + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); + DHT_STACK_UNWIND (lookup, frame, local->op_ret, + local->op_errno, local->inode, + &local->stbuf, local->xattr, + &local->postparent); + return 0; + } + + gf_log (this->name, GF_LOG_DEBUG, + "linking file %s existing on %s to %s (hash)", + local->loc.path, cached_subvol->name, + hashed_subvol->name); + + ret = dht_linkfile_create (frame, + dht_lookup_linkfile_create_cbk, this, + cached_subvol, hashed_subvol, &local->loc); + + return ret; +} + + +int +dht_lookup_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + int this_call_cnt = 0; + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + dht_lookup_everywhere_done (frame, this); + } + + return 0; } int dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct stat *buf, dict_t *xattr) + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *buf, dict_t *xattr, + struct iatt *postparent) { - dht_conf_t *conf = NULL; dht_local_t *local = NULL; int this_call_cnt = 0; call_frame_t *prev = NULL; - int is_linkfile = 0; - int is_dir = 0; - xlator_t *subvol = NULL; - loc_t *loc = NULL; - xlator_t *link_subvol = NULL; - xlator_t *hashed_subvol = NULL; - xlator_t *cached_subvol = NULL; - int ret = -1; + int is_linkfile = 0; + int is_dir = 0; + xlator_t *subvol = NULL; + loc_t *loc = NULL; + xlator_t *link_subvol = NULL; + int ret = -1; + int32_t fd_count = 0; + dht_conf_t *conf = NULL; + + GF_VALIDATE_OR_GOTO ("dht", frame, out); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + GF_VALIDATE_OR_GOTO ("dht", this->private, out); + + local = frame->local; + loc = &local->loc; + conf = this->private; + + prev = cookie; + subvol = prev->this; - conf = this->private; + LOCK (&frame->lock); + { + if (op_ret == -1) { + if (op_errno != ENOENT) + local->op_errno = op_errno; + goto unlock; + } - local = frame->local; - loc = &local->loc; + if (uuid_is_null (local->gfid)) + uuid_copy (local->gfid, buf->ia_gfid); - prev = cookie; - subvol = prev->this; + if (uuid_compare (local->gfid, buf->ia_gfid)) { + gf_log (this->name, GF_LOG_WARNING, + "%s: gfid differs on subvolume %s", + loc->path, prev->this->name); + } - LOCK (&frame->lock); - { - if (op_ret == -1) { - if (op_errno != ENOENT) - local->op_errno = op_errno; - goto unlock; - } + is_linkfile = check_is_linkfile (inode, buf, xattr, + conf->link_xattr_name); + is_dir = check_is_dir (inode, buf, xattr); - is_linkfile = check_is_linkfile (inode, buf, xattr); - is_dir = check_is_dir (inode, buf, xattr); - - if (is_linkfile) { - link_subvol = dht_linkfile_subvol (this, inode, buf, - xattr); - gf_log (this->name, GF_LOG_DEBUG, - "found on %s linkfile %s (-> %s)", - subvol->name, loc->path, - link_subvol ? link_subvol->name : "''"); - goto unlock; - } + if (is_linkfile) { + link_subvol = dht_linkfile_subvol (this, inode, buf, + xattr); + gf_log (this->name, GF_LOG_DEBUG, + "found on %s linkfile %s (-> %s)", + subvol->name, loc->path, + link_subvol ? link_subvol->name : "''"); + goto unlock; + } + + /* non linkfile GFID takes precedence */ + uuid_copy (local->gfid, buf->ia_gfid); if (is_dir) { local->dir_count++; @@ -400,189 +998,183 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (!local->cached_subvol) { /* found one file */ - dht_stat_merge (this, &local->stbuf, buf, + dht_iatt_merge (this, &local->stbuf, buf, subvol); local->xattr = dict_ref (xattr); local->cached_subvol = subvol; gf_log (this->name, GF_LOG_DEBUG, "found on %s file %s", subvol->name, loc->path); + + dht_iatt_merge (this, &local->postparent, + postparent, subvol); } else { - gf_log (this->name, GF_LOG_DEBUG, + /* This is where we need 'rename' both entries logic */ + gf_log (this->name, GF_LOG_WARNING, "multiple subvolumes (%s and %s) have " - "file %s", local->cached_subvol->name, + "file %s (preferably rename the file " + "in the backend, and do a fresh lookup)", + local->cached_subvol->name, subvol->name, local->loc.path); } } - } + } unlock: - UNLOCK (&frame->lock); - - if (is_linkfile) { - gf_log (this->name, GF_LOG_DEBUG, - "deleting stale linkfile %s on %s", - loc->path, subvol->name); - dht_linkfile_unlink (frame, this, subvol, loc); - } - - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - hashed_subvol = local->hashed_subvol; - cached_subvol = local->cached_subvol; - - if (local->file_count && local->dir_count) { - gf_log (this->name, GF_LOG_ERROR, - "path %s exists as a file on one subvolume " - "and directory on another. " - "Please fix it manually", - loc->path); - DHT_STACK_UNWIND (frame, -1, EIO, NULL, NULL, NULL); - return 0; - } - - if (local->dir_count) { - dht_lookup_directory (frame, this, &local->loc); - return 0; - } - - if (!cached_subvol) { - DHT_STACK_UNWIND (frame, -1, ENOENT, NULL, NULL, NULL); - return 0; - } - - if (!hashed_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "cannot create linkfile file for %s on %s: " - "hashed subvolume cannot be found.", - loc->path, cached_subvol->name); - - local->op_ret = 0; - local->op_errno = 0; - - ret = dht_layout_inode_set (frame->this, cached_subvol, - local->inode); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "failed to set layout for subvol %s", - cached_subvol ? cached_subvol->name : - "<nil>"); - local->op_ret = -1; - local->op_errno = EINVAL; - } + UNLOCK (&frame->lock); - DHT_STACK_UNWIND (frame, local->op_ret, - local->op_errno, local->inode, - &local->stbuf, local->xattr); + if (is_linkfile) { + ret = dict_get_int32 (xattr, GLUSTERFS_OPEN_FD_COUNT, &fd_count); + /* Delete the linkfile only if there are no open fds on it. + if there is a open-fd, it may be in migration */ + if (!ret && (fd_count == 0)) { + gf_log (this->name, GF_LOG_INFO, + "deleting stale linkfile %s on %s", + loc->path, subvol->name); + STACK_WIND (frame, dht_lookup_unlink_cbk, + subvol, subvol->fops->unlink, loc, 0, NULL); return 0; } + } - gf_log (this->name, GF_LOG_DEBUG, - "linking file %s existing on %s to %s (hash)", - loc->path, cached_subvol->name, - hashed_subvol->name); - - dht_linkfile_create (frame, - dht_lookup_linkfile_create_cbk, - cached_subvol, hashed_subvol, loc); - } + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + dht_lookup_everywhere_done (frame, this); + } - return 0; +out: + return ret; } int dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc) { - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - int i = 0; - int call_cnt = 0; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int i = 0; + int call_cnt = 0; - conf = this->private; - local = frame->local; + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", this->private, out); + GF_VALIDATE_OR_GOTO ("dht", loc, out); - call_cnt = conf->subvolume_cnt; - local->call_cnt = call_cnt; + conf = this->private; + local = frame->local; - if (!local->inode) - local->inode = inode_ref (loc->inode); + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; - for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_lookup_everywhere_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->lookup, - loc, local->xattr_req); - } + if (!local->inode) + local->inode = inode_ref (loc->inode); - return 0; + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_lookup_everywhere_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, + loc, local->xattr_req); + } + + return 0; +out: + DHT_STACK_UNWIND (lookup, frame, -1, EINVAL, NULL, NULL, NULL, NULL); +err: + return -1; } int dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, - inode_t *inode, struct stat *stbuf, dict_t *xattr) + inode_t *inode, struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) { - call_frame_t *prev = NULL; - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; - xlator_t *subvol = NULL; - loc_t *loc = NULL; - dht_conf_t *conf = NULL; + call_frame_t *prev = NULL; + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + loc_t *loc = NULL; + dht_conf_t *conf = NULL; + int ret = 0; + + GF_VALIDATE_OR_GOTO ("dht", frame, out); + GF_VALIDATE_OR_GOTO ("dht", this, unwind); + GF_VALIDATE_OR_GOTO ("dht", frame->local, unwind); + GF_VALIDATE_OR_GOTO ("dht", this->private, unwind); + GF_VALIDATE_OR_GOTO ("dht", cookie, unwind); prev = cookie; - subvol = prev->this; - conf = this->private; - local = frame->local; - loc = &local->loc; + subvol = prev->this; + conf = this->private; + local = frame->local; + loc = &local->loc; if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "lookup of %s on %s (following linkfile) failed (%s)", - local->loc.path, subvol->name, strerror (op_errno)); - goto err; - } + gf_log (this->name, GF_LOG_INFO, + "lookup of %s on %s (following linkfile) failed (%s)", + local->loc.path, subvol->name, strerror (op_errno)); + + /* If cached subvol returned ENOTCONN, do not do + lookup_everywhere. We need to make sure linkfile does not get + removed, which can take away the namespace, and subvol is + anyways down. */ + + if (op_errno != ENOTCONN) + goto err; + else + goto unwind; + } if (check_is_dir (inode, stbuf, xattr)) { - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_INFO, "lookup of %s on %s (following linkfile) reached dir", local->loc.path, subvol->name); goto err; } - if (check_is_linkfile (inode, stbuf, xattr)) { - gf_log (this->name, GF_LOG_DEBUG, + if (check_is_linkfile (inode, stbuf, xattr, conf->link_xattr_name)) { + gf_log (this->name, GF_LOG_INFO, "lookup of %s on %s (following linkfile) reached link", local->loc.path, subvol->name); goto err; } - if ((stbuf->st_nlink == 1) - && (conf->unhashed_sticky_bit)) { - stbuf->st_mode |= S_ISVTX; - } - dht_itransform (this, prev->this, stbuf->st_ino, &stbuf->st_ino); - - layout = dht_layout_for_subvol (this, prev->this); - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no pre-set layout for subvolume %s", - prev->this->name); - op_ret = -1; - op_errno = EINVAL; - goto out; - } + if (uuid_compare (local->gfid, stbuf->ia_gfid)) { + gf_log (this->name, GF_LOG_WARNING, + "%s: gfid different on data file on %s", + local->loc.path, subvol->name); + goto err; + } - inode_ctx_put (inode, this, (uint64_t)(long)layout); + if ((stbuf->ia_nlink == 1) + && (conf && conf->unhashed_sticky_bit)) { + stbuf->ia_prot.sticky = 1; + } -out: - DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf, xattr); + ret = dht_layout_preset (this, prev->this, inode); + if (ret < 0) { + gf_log (this->name, GF_LOG_INFO, + "failed to set layout for subvolume %s", + prev->this->name); + op_ret = -1; + op_errno = EINVAL; + } + + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + postparent, 1); + } + +unwind: + DHT_STRIP_PHASE1_FLAGS (stbuf); + DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, stbuf, xattr, + postparent); return 0; err: dht_lookup_everywhere (frame, this, loc); - +out: return 0; } @@ -594,21 +1186,38 @@ dht_lookup_directory (call_frame_t *frame, xlator_t *this, loc_t *loc) int i = 0; dht_conf_t *conf = NULL; dht_local_t *local = NULL; + int ret = 0; + + GF_VALIDATE_OR_GOTO ("dht", frame, out); + GF_VALIDATE_OR_GOTO ("dht", this, unwind); + GF_VALIDATE_OR_GOTO ("dht", frame->local, unwind); + GF_VALIDATE_OR_GOTO ("dht", this->private, unwind); + GF_VALIDATE_OR_GOTO ("dht", loc, unwind); conf = this->private; local = frame->local; call_cnt = conf->subvolume_cnt; local->call_cnt = call_cnt; - + local->layout = dht_layout_new (this, conf->subvolume_cnt); if (!local->layout) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - DHT_STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; + goto unwind; } - + + if (local->xattr != NULL) { + dict_unref (local->xattr); + local->xattr = NULL; + } + + if (!uuid_is_null (local->gfid)) { + ret = dict_set_static_bin (local->xattr_req, "gfid-req", + local->gfid, 16); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set gfid", local->loc.path); + } + for (i = 0; i < call_cnt; i++) { STACK_WIND (frame, dht_lookup_dir_cbk, conf->subvolumes[i], @@ -616,23 +1225,35 @@ dht_lookup_directory (call_frame_t *frame, xlator_t *this, loc_t *loc) &local->loc, local->xattr_req); } return 0; +unwind: + DHT_STACK_UNWIND (lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL); +out: + return 0; + } int dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, - inode_t *inode, struct stat *stbuf, dict_t *xattr) + inode_t *inode, struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) { - dht_layout_t *layout = NULL; - char is_linkfile = 0; - char is_dir = 0; - xlator_t *subvol = NULL; - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - loc_t *loc = NULL; - call_frame_t *prev = NULL; + char is_linkfile = 0; + char is_dir = 0; + xlator_t *subvol = NULL; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + loc_t *loc = NULL; + call_frame_t *prev = NULL; + int ret = 0; + dht_layout_t *parent_layout = NULL; + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + GF_VALIDATE_OR_GOTO ("dht", this->private, out); conf = this->private; @@ -640,76 +1261,140 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local = frame->local; loc = &local->loc; - if (ENTRY_MISSING (op_ret, op_errno)) { - if (conf->search_unhashed) { - local->op_errno = ENOENT; - dht_lookup_everywhere (frame, this, loc); - return 0; - } - } + /* This is required for handling stale linkfile deletion, + * or any more call which happens from this 'loc'. + */ + if (!op_ret && uuid_is_null (local->gfid)) + memcpy (local->gfid, stbuf->ia_gfid, 16); + + if (ENTRY_MISSING (op_ret, op_errno)) { + gf_log (this->name, GF_LOG_TRACE, "Entry %s missing on subvol" + " %s", loc->path, prev->this->name); + if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_ON) { + local->op_errno = ENOENT; + dht_lookup_everywhere (frame, this, loc); + return 0; + } + if ((conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) && + (loc->parent)) { + ret = dht_inode_ctx_layout_get (loc->parent, this, + &parent_layout); + if (ret || !parent_layout) + goto out; + if (parent_layout->search_unhashed) { + local->op_errno = ENOENT; + dht_lookup_everywhere (frame, this, loc); + return 0; + } + } + } - if (op_ret == 0) { - is_dir = check_is_dir (inode, stbuf, xattr); - if (is_dir) { - local->inode = inode_ref (inode); - local->xattr = dict_ref (xattr); - } - } + if (op_ret == 0) { + is_dir = check_is_dir (inode, stbuf, xattr); + if (is_dir) { + local->inode = inode_ref (inode); + local->xattr = dict_ref (xattr); + } + } - if (is_dir || (op_ret == -1 && op_errno == ENOTCONN)) { + if (is_dir || (op_ret == -1 && op_errno == ENOTCONN)) { dht_lookup_directory (frame, this, &local->loc); return 0; - } - - if (op_ret == -1) + } + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, "Lookup of %s for subvolume" + " %s failed with error %s", loc->path, prev->this->name, + strerror (op_errno)); goto out; + } - is_linkfile = check_is_linkfile (inode, stbuf, xattr); - is_dir = check_is_dir (inode, stbuf, xattr); + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); - if (!is_dir && !is_linkfile) { + if (!is_linkfile) { /* non-directory and not a linkfile */ - dht_itransform (this, prev->this, stbuf->st_ino, - &stbuf->st_ino); - - layout = dht_layout_for_subvol (this, prev->this); - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no pre-set layout for subvolume %s", - prev->this->name); - op_ret = -1; - op_errno = EINVAL; - goto out; - } - - inode_ctx_put (inode, this, (uint64_t)(long)layout); - goto out; - } - - if (is_linkfile) { - subvol = dht_linkfile_subvol (this, inode, stbuf, xattr); - - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "linkfile not having link subvolume. path=%s", - loc->path); - dht_lookup_everywhere (frame, this, loc); - return 0; + ret = dht_layout_preset (this, prev->this, inode); + if (ret < 0) { + gf_log (this->name, GF_LOG_INFO, + "could not set pre-set layout for subvolume %s", + prev->this->name); + op_ret = -1; + op_errno = EINVAL; + goto out; } + goto out; + } - STACK_WIND (frame, dht_lookup_linkfile_cbk, - subvol, subvol->fops->lookup, - &local->loc, local->xattr_req); + subvol = dht_linkfile_subvol (this, inode, stbuf, xattr); + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "linkfile not having link subvolume. path=%s", + loc->path); + dht_lookup_everywhere (frame, this, loc); + return 0; } + STACK_WIND (frame, dht_lookup_linkfile_cbk, + subvol, subvol->fops->lookup, + &local->loc, local->xattr_req); + return 0; out: - DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf, xattr); + /* + * FIXME: postparent->ia_size and postparent->st_blocks do not have + * correct values. since, postparent corresponds to a directory these + * two members should have values equal to sum of corresponding values + * from each of the subvolume. See dht_iatt_merge for reference. + */ + + if (!op_ret && local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + postparent, 1); + } + + DHT_STRIP_PHASE1_FLAGS (stbuf); + DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, stbuf, xattr, + postparent); +err: return 0; } +/* For directories, check if acl xattrs have been requested (by the acl xlator), + * if not, request for them. These xattrs are needed for dht dir self-heal to + * perform proper self-healing of dirs + */ +void +dht_check_and_set_acl_xattr_req (inode_t *inode, dict_t *xattr_req) +{ + int ret = 0; + + GF_ASSERT (inode); + GF_ASSERT (xattr_req); + + if (inode->ia_type != IA_IFDIR) + return; + + if (!dict_get (xattr_req, POSIX_ACL_ACCESS_XATTR)) { + ret = dict_set_int8 (xattr_req, POSIX_ACL_ACCESS_XATTR, 0); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set key %s", + POSIX_ACL_ACCESS_XATTR); + } + + if (!dict_get (xattr_req, POSIX_ACL_DEFAULT_XATTR)) { + ret = dict_set_int8 (xattr_req, POSIX_ACL_DEFAULT_XATTR, 0); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set key %s", + POSIX_ACL_DEFAULT_XATTR); + } + + return; +} int dht_lookup (call_frame_t *frame, xlator_t *this, @@ -717,56 +1402,68 @@ dht_lookup (call_frame_t *frame, xlator_t *this, { xlator_t *subvol = NULL; xlator_t *hashed_subvol = NULL; - xlator_t *cached_subvol = NULL; dht_local_t *local = NULL; - dht_conf_t *conf = NULL; + dht_conf_t *conf = NULL; int ret = -1; int op_errno = -1; - dht_layout_t *layout = NULL; - int i = 0; - int call_cnt = 0; - + dht_layout_t *layout = NULL; + int i = 0; + int call_cnt = 0; + loc_t new_loc = {0,}; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - conf = this->private; - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + conf = this->private; + if (!conf) + goto err; - ret = loc_dup (loc, &local->loc); - if (ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "copying location failed for path=%s", - loc->path); + local = dht_local_init (frame, loc, NULL, GF_FOP_LOOKUP); + if (!local) { + op_errno = ENOMEM; goto err; } - - if (xattr_req) { - local->xattr_req = dict_ref (xattr_req); - } else { - local->xattr_req = dict_new (); - } - hashed_subvol = dht_subvol_get_hashed (this, loc); - cached_subvol = dht_subvol_get_cached (this, loc->inode); + ret = dht_filter_loc_subvol_key (this, loc, &new_loc, + &hashed_subvol); + if (ret) { + loc_wipe (&local->loc); + ret = loc_dup (&new_loc, &local->loc); - local->cached_subvol = cached_subvol; - local->hashed_subvol = hashed_subvol; + /* we no more need 'new_loc' entries */ + loc_wipe (&new_loc); - if (is_revalidate (loc)) { - layout = dht_layout_get (this, loc->inode); + /* check if loc_dup() is successful */ + if (ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_DEBUG, + "copying location failed for path=%s", + loc->path); + goto err; + } + } + + if (xattr_req) { + local->xattr_req = dict_ref (xattr_req); + } else { + local->xattr_req = dict_new (); + } + + if (uuid_is_null (loc->pargfid) && !uuid_is_null (loc->gfid) && + !__is_root_gfid (loc->inode->gfid)) { + local->cached_subvol = NULL; + dht_discover (frame, this, loc); + return 0; + } + + if (!hashed_subvol) + hashed_subvol = dht_subvol_get_hashed (this, loc); + local->hashed_subvol = hashed_subvol; + if (is_revalidate (loc)) { + layout = local->layout; if (!layout) { gf_log (this->name, GF_LOG_DEBUG, "revalidate without cache. path=%s", @@ -775,67 +1472,93 @@ dht_lookup (call_frame_t *frame, xlator_t *this, goto err; } - if (layout->gen && (layout->gen < conf->gen)) { - gf_log (this->name, GF_LOG_TRACE, - "incomplete layout failure for path=%s", - loc->path); - op_errno = ESTALE; - goto err; - } + if (layout->gen && (layout->gen < conf->gen)) { + gf_log (this->name, GF_LOG_TRACE, + "incomplete layout failure for path=%s", + loc->path); + + dht_layout_unref (this, local->layout); + local->layout = NULL; + local->cached_subvol = NULL; + goto do_fresh_lookup; + } - local->inode = inode_ref (loc->inode); - local->st_ino = loc->inode->ino; - - local->call_cnt = layout->cnt; - call_cnt = local->call_cnt; - - /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute, - * revalidates directly go to the cached-subvolume. - */ - ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); - - for (i = 0; i < layout->cnt; i++) { + local->inode = inode_ref (loc->inode); + + /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute, + * revalidates directly go to the cached-subvolume. + */ + ret = dict_set_uint32 (local->xattr_req, + conf->xattr_name, 4 * 4); + + if (IA_ISDIR (local->inode->ia_type)) { + local->call_cnt = call_cnt = conf->subvolume_cnt; + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_revalidate_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, + loc, local->xattr_req); + } + return 0; + } + + call_cnt = local->call_cnt = layout->cnt; + + /* need it for self-healing linkfiles which is + 'in-migration' state */ + ret = dict_set_uint32 (local->xattr_req, + GLUSTERFS_OPEN_FD_COUNT, 4); + + /* need it for dir self-heal */ + dht_check_and_set_acl_xattr_req (loc->inode, local->xattr_req); + + for (i = 0; i < call_cnt; i++) { subvol = layout->list[i].xlator; - + STACK_WIND (frame, dht_revalidate_cbk, subvol, subvol->fops->lookup, - loc, local->xattr_req); + &local->loc, local->xattr_req); - if (!--call_cnt) - break; } } else { - /* TODO: remove the hard-coding */ - ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + do_fresh_lookup: + /* TODO: remove the hard-coding */ + ret = dict_set_uint32 (local->xattr_req, + conf->xattr_name, 4 * 4); - ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht.linkto", 256); + ret = dict_set_uint32 (local->xattr_req, + conf->link_xattr_name, 256); + + /* need it for self-healing linkfiles which is + 'in-migration' state */ + ret = dict_set_uint32 (local->xattr_req, + GLUSTERFS_OPEN_FD_COUNT, 4); + + /* need it for dir self-heal */ + dht_check_and_set_acl_xattr_req (loc->inode, local->xattr_req); if (!hashed_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s, " - "checking on all the subvols to see if " - "it is a directory", loc->path); - call_cnt = conf->subvolume_cnt; - local->call_cnt = call_cnt; - - local->layout = dht_layout_new (this, conf->subvolume_cnt); - if (!local->layout) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } - - for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_lookup_dir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->lookup, - &local->loc, local->xattr_req); - } - return 0; + gf_log (this->name, GF_LOG_DEBUG, + "no subvolume in layout for path=%s, " + "checking on all the subvols to see if " + "it is a directory", loc->path); + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + local->layout = dht_layout_new (this, + conf->subvolume_cnt); + if (!local->layout) { + op_errno = ENOMEM; + goto err; + } + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_lookup_dir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, + &local->loc, local->xattr_req); + } + return 0; } STACK_WIND (frame, dht_lookup_cbk, @@ -846,1313 +1569,1696 @@ dht_lookup (call_frame_t *frame, xlator_t *this, return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, + NULL); return 0; } int -dht_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct stat *stbuf) +dht_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + local = frame->local; + prev = cookie; - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_ret = -1; + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto unlock; + } - dht_stat_merge (this, &local->stbuf, stbuf, prev->this); - - if (local->inode) - local->stbuf.st_ino = local->inode->ino; - local->op_ret = 0; - } + local->op_ret = 0; + + local->postparent = *postparent; + local->preparent = *preparent; + + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + &local->preparent, 0); + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); + } + } unlock: - UNLOCK (&frame->lock); + UNLOCK (&frame->lock); - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->stbuf); + DHT_STACK_UNWIND (unlink, frame, local->op_ret, local->op_errno, + &local->preparent, &local->postparent, NULL); return 0; } int -dht_stat (call_frame_t *frame, xlator_t *this, - loc_t *loc) +dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; - int i = 0; + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + xlator_t *cached_subvol = NULL; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); + local = frame->local; + prev = cookie; - layout = dht_layout_get (this, loc->inode); - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no layout for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + LOCK (&frame->lock); + { + if ((op_ret == -1) && !((op_errno == ENOENT) || + (op_errno == ENOTCONN))) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto unlock; + } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + local->op_ret = 0; + } +unlock: + UNLOCK (&frame->lock); - local->inode = inode_ref (loc->inode); - local->call_cnt = layout->cnt; + if (local->op_ret == -1) + goto err; - for (i = 0; i < layout->cnt; i++) { - subvol = layout->list[i].xlator; + cached_subvol = dht_subvol_get_cached (this, local->loc.inode); + if (!cached_subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for path=%s", + local->loc.path); + local->op_errno = EINVAL; + goto err; + } - STACK_WIND (frame, dht_attr_cbk, - subvol, subvol->fops->stat, - loc); - } + STACK_WIND (frame, dht_unlink_cbk, + cached_subvol, cached_subvol->fops->unlink, + &local->loc, local->flags, NULL); - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL); - - return 0; + DHT_STACK_UNWIND (unlink, frame, -1, local->op_errno, + NULL, NULL, NULL); + return 0; } - int -dht_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd) +dht_err_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; - int i = 0; - - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - layout = dht_layout_get (this, fd->inode); - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no layout for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + local = frame->local; + prev = cookie; - local->inode = inode_ref (fd->inode); - local->call_cnt = layout->cnt;; + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto unlock; + } - for (i = 0; i < layout->cnt; i++) { - subvol = layout->list[i].xlator; - STACK_WIND (frame, dht_attr_cbk, - subvol, subvol->fops->fstat, - fd); - } + local->op_ret = 0; + } +unlock: + UNLOCK (&frame->lock); - return 0; + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + DHT_STACK_UNWIND (setxattr, frame, local->op_ret, + local->op_errno, NULL); + } -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} - return 0; +static void +fill_layout_info (dht_layout_t *layout, char *buf) +{ + int i = 0; + char tmp_buf[128] = {0,}; + + for (i = 0; i < layout->cnt; i++) { + snprintf (tmp_buf, 128, "(%s %u %u)", + layout->list[i].xlator->name, + layout->list[i].start, + layout->list[i].stop); + if (i) + strcat (buf, " "); + strcat (buf, tmp_buf); + } } +void +dht_fill_pathinfo_xattr (xlator_t *this, dht_local_t *local, + char *xattr_buf, int32_t alloc_len, + int flag, char *layout_buf) +{ + if (flag && local->xattr_val) + snprintf (xattr_buf, alloc_len, + "((<"DHT_PATHINFO_HEADER"%s> %s) (%s-layout %s))", + this->name, local->xattr_val, this->name, + layout_buf); + else if (local->xattr_val) + snprintf (xattr_buf, alloc_len, + "(<"DHT_PATHINFO_HEADER"%s> %s)", + this->name, local->xattr_val); + else if (flag) + snprintf (xattr_buf, alloc_len, "(%s-layout %s)", + this->name, layout_buf); +} int -dht_chmod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode) +dht_vgetxattr_alloc_and_fill (dht_local_t *local, dict_t *xattr, xlator_t *this, + int op_errno) { - dht_layout_t *layout = NULL; - dht_local_t *local = NULL; - int op_errno = -1; - int i = -1; + int ret = -1; + char *value = NULL; + int32_t plen = 0; + ret = dict_get_str (xattr, local->xsel, &value); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Subvolume %s returned -1 (%s)", this->name, + strerror (op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + goto out; + } - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); + local->alloc_len += strlen(value); - layout = dht_layout_get (this, loc->inode); + if (!local->xattr_val) { + local->alloc_len += (strlen (DHT_PATHINFO_HEADER) + 10); + local->xattr_val = GF_CALLOC (local->alloc_len, sizeof (char), + gf_common_mt_char); + if (!local->xattr_val) { + ret = -1; + goto out; + } + } - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no layout for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + if (local->xattr_val) { + plen = strlen (local->xattr_val); + if (plen) { + /* extra byte(s) for \0 to be safe */ + local->alloc_len += (plen + 2); + local->xattr_val = GF_REALLOC (local->xattr_val, + local->alloc_len); + if (!local->xattr_val) { + ret = -1; + goto out; + } + } - if (!layout_is_sane (layout)) { - gf_log (this->name, GF_LOG_DEBUG, - "layout is not sane for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + (void) strcat (local->xattr_val, value); + (void) strcat (local->xattr_val, " "); + local->op_ret = 0; + } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_DEBUG, - "memory allocation failed :("); - goto err; - } + ret = 0; - local->inode = inode_ref (loc->inode); - local->call_cnt = layout->cnt; + out: + return ret; +} - for (i = 0; i < layout->cnt; i++) { - STACK_WIND (frame, dht_attr_cbk, - layout->list[i].xlator, - layout->list[i].xlator->fops->chmod, - loc, mode); - } +int +dht_vgetxattr_fill_and_set (dht_local_t *local, dict_t **dict, xlator_t *this, + gf_boolean_t flag) +{ + int ret = -1; + char *xattr_buf = NULL; + char layout_buf[8192] = {0,}; - return 0; + if (flag) + fill_layout_info (local->layout, layout_buf); -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + *dict = dict_new (); + if (!*dict) + goto out; - return 0; -} + local->xattr_val[strlen (local->xattr_val) - 1] = '\0'; + + /* we would need max this many bytes to create xattr string + * extra 40 bytes is just an estimated amount of additional + * space required as we include translator name and some + * spaces, brackets etc. when forming the pathinfo string. + * + * For node-uuid we just don't have all the pretty formatting, + * but since this is a generic routine for pathinfo & node-uuid + * we dont have conditional space allocation and try to be + * generic + */ + local->alloc_len += (2 * strlen (this->name)) + + strlen (layout_buf) + + 40; + xattr_buf = GF_CALLOC (local->alloc_len, sizeof (char), + gf_common_mt_char); + if (!xattr_buf) + goto out; + + if (XATTR_IS_PATHINFO (local->xsel)) { + (void) dht_fill_pathinfo_xattr (this, local, xattr_buf, + local->alloc_len, flag, + layout_buf); + } else if (XATTR_IS_NODE_UUID (local->xsel)) { + (void) snprintf (xattr_buf, local->alloc_len, "%s", + local->xattr_val); + } else { + gf_log (this->name, GF_LOG_WARNING, + "Unknown local->xsel (%s)", local->xsel); + GF_FREE (xattr_buf); + goto out; + } + ret = dict_set_dynstr (*dict, local->xsel, xattr_buf); + if (ret) + GF_FREE (xattr_buf); + GF_FREE (local->xattr_val); + + out: + return ret; +} int -dht_chown (call_frame_t *frame, xlator_t *this, - loc_t *loc, uid_t uid, gid_t gid) +dht_vgetxattr_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { - dht_layout_t *layout = NULL; - dht_local_t *local = NULL; - int op_errno = -1; - int i = -1; + int ret = 0; + dht_local_t *local = NULL; + int this_call_cnt = 0; + dict_t *dict = NULL; + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (frame->local, out); - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); + local = frame->local; - layout = dht_layout_get (this, loc->inode); - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no layout for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + LOCK (&frame->lock); + { + this_call_cnt = --local->call_cnt; + if (op_ret < 0) { + if (op_errno != ENOTCONN) { + gf_log (this->name, GF_LOG_ERROR, + "getxattr err (%s) for dir", + strerror (op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + } - if (!layout_is_sane (layout)) { - gf_log (this->name, GF_LOG_DEBUG, - "layout is not sane for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + goto unlock; + } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + ret = dht_vgetxattr_alloc_and_fill (local, xattr, this, + op_errno); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "alloc or fill failure"); + } + unlock: + UNLOCK (&frame->lock); - local->inode = inode_ref (loc->inode); - local->call_cnt = layout->cnt; + if (!is_last_call (this_call_cnt)) + goto out; - for (i = 0; i < layout->cnt; i++) { - STACK_WIND (frame, dht_attr_cbk, - layout->list[i].xlator, - layout->list[i].xlator->fops->chown, - loc, uid, gid); - } + /* -- last call: do patch ups -- */ - return 0; + if (local->op_ret == -1) { + goto unwind; + } -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + ret = dht_vgetxattr_fill_and_set (local, &dict, this, _gf_true); + if (ret) + goto unwind; - return 0; -} + DHT_STACK_UNWIND (getxattr, frame, 0, 0, dict, xdata); + goto cleanup; + unwind: + DHT_STACK_UNWIND (getxattr, frame, -1, local->op_errno, NULL, NULL); + cleanup: + if (dict) + dict_unref (dict); + out: + return 0; +} int -dht_fchmod (call_frame_t *frame, xlator_t *this, - fd_t *fd, mode_t mode) +dht_vgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { - dht_layout_t *layout = NULL; - dht_local_t *local = NULL; - int op_errno = -1; - int i = -1; + dht_local_t *local = NULL; + int ret = 0; + dict_t *dict = NULL; + call_frame_t *prev = NULL; + gf_boolean_t flag = _gf_true; + local = frame->local; + prev = cookie; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_ERROR, "Subvolume %s returned -1 " + "(%s)", prev->this->name, strerror (op_errno)); + goto unwind; + } + ret = dht_vgetxattr_alloc_and_fill (local, xattr, this, + op_errno); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "alloc or fill failure"); + goto unwind; + } - layout = dht_layout_get (this, fd->inode); - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no layout for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + flag = (local->layout->cnt > 1) ? _gf_true : _gf_false; - if (!layout_is_sane (layout)) { - gf_log (this->name, GF_LOG_DEBUG, - "layout is not sane for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + ret = dht_vgetxattr_fill_and_set (local, &dict, this, flag); + if (ret) + goto unwind; - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + DHT_STACK_UNWIND (getxattr, frame, 0, 0, dict, xdata); + goto cleanup; - local->inode = inode_ref (fd->inode); - local->call_cnt = layout->cnt; + unwind: + DHT_STACK_UNWIND (getxattr, frame, -1, local->op_errno, + NULL, NULL); + cleanup: + if (dict) + dict_unref (dict); - for (i = 0; i < layout->cnt; i++) { - STACK_WIND (frame, dht_attr_cbk, - layout->list[i].xlator, - layout->list[i].xlator->fops->fchmod, - fd, mode); - } + return 0; +} - return 0; +int +dht_linkinfo_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, + dict_t *xdata) +{ + int ret = 0; + char *value = NULL; + + if (op_ret != -1) { + ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &value); + if (!ret) { + ret = dict_set_str (xattr, GF_XATTR_LINKINFO_KEY, value); + if (!ret) + gf_log (this->name, GF_LOG_TRACE, + "failed to set linkinfo"); + } + } -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata); - return 0; + return 0; } - int -dht_fchown (call_frame_t *frame, xlator_t *this, - fd_t *fd, uid_t uid, gid_t gid) +dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { - dht_layout_t *layout = NULL; - dht_local_t *local = NULL; - int op_errno = -1; - int i = -1; + int this_call_cnt = 0; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (frame->local, out); + VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - layout = dht_layout_get (this, fd->inode); - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no layout for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + conf = this->private; + local = frame->local; - if (!layout_is_sane (layout)) { - gf_log (this->name, GF_LOG_DEBUG, - "layout is not sane for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + this_call_cnt = dht_frame_return (frame); - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + if (!xattr || (op_ret == -1)) + goto out; - local->inode = inode_ref (fd->inode); - local->call_cnt = layout->cnt; + if (dict_get (xattr, conf->xattr_name)) { + dict_del (xattr, conf->xattr_name); + } - for (i = 0; i < layout->cnt; i++) { - STACK_WIND (frame, dht_attr_cbk, - layout->list[i].xlator, - layout->list[i].xlator->fops->fchown, - fd, uid, gid); - } + if (frame->root->pid >= 0 ) { + GF_REMOVE_INTERNAL_XATTR("trusted.glusterfs.quota*", xattr); + GF_REMOVE_INTERNAL_XATTR("trusted.pgfid*", xattr); + } - return 0; + local->op_ret = 0; -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + if (!local->xattr) { + local->xattr = dict_copy_with_ref (xattr, NULL); + } else { + dht_aggregate_xattr (local->xattr, xattr); + } +out: + if (is_last_call (this_call_cnt)) { + DHT_STACK_UNWIND (getxattr, frame, local->op_ret, op_errno, + local->xattr, NULL); + } + return 0; +} - return 0; +int32_t +dht_getxattr_unwind (call_frame_t *frame, + int op_ret, int op_errno, dict_t *dict, dict_t *xdata) +{ + DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); + return 0; } int -dht_utimens (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct timespec tv[2]) +dht_getxattr_get_real_filename_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + dict_t *xattr, dict_t *xdata) { - dht_layout_t *layout = NULL; - dht_local_t *local = NULL; - int op_errno = -1; - int i = -1; + int this_call_cnt = 0; + dht_local_t *local = NULL; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); + local = frame->local; - layout = dht_layout_get (this, loc->inode); - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no layout for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + if (op_ret != -1) { + if (local->xattr) + dict_unref (local->xattr); + local->xattr = dict_ref (xattr); - if (!layout_is_sane (layout)) { - gf_log (this->name, GF_LOG_DEBUG, - "layout is not sane for path=%s", loc->path); - op_errno = EINVAL; - goto err; + if (local->xattr_req) + dict_unref (local->xattr_req); + local->xattr_req = dict_ref (xdata); } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + DHT_STACK_UNWIND (getxattr, frame, local->op_ret, op_errno, + local->xattr, local->xattr_req); } - local->inode = inode_ref (loc->inode); - local->call_cnt = layout->cnt; + return 0; +} - for (i = 0; i < layout->cnt; i++) { - STACK_WIND (frame, dht_attr_cbk, - layout->list[i].xlator, - layout->list[i].xlator->fops->utimens, - loc, tv); - } - return 0; +int +dht_getxattr_get_real_filename (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *key, dict_t *xdata) +{ + dht_local_t *local = NULL; + int i = 0; + dht_layout_t *layout = NULL; + int cnt = 0; + xlator_t *subvol = NULL; -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + local = frame->local; + layout = local->layout; + + cnt = local->call_cnt = layout->cnt; + + local->op_ret = -1; + local->op_errno = ENODATA; + + for (i = 0; i < cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND (frame, dht_getxattr_get_real_filename_cbk, + subvol, subvol->fops->getxattr, + loc, key, xdata); + } return 0; } int -dht_truncate (call_frame_t *frame, xlator_t *this, - loc_t *loc, off_t offset) +dht_getxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *key, dict_t *xdata) +#define DHT_IS_DIR(layout) (layout->cnt > 1) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; + xlator_t *subvol = NULL; + xlator_t *hashed_subvol = NULL; + xlator_t *cached_subvol = NULL; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + xlator_t **sub_volumes = NULL; + int op_errno = -1; + int i = 0; + int cnt = 0; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); + VALIDATE_OR_GOTO (this->private, err); - subvol = dht_subvol_get_cached (this, loc->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } - - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + conf = this->private; - local->inode = inode_ref (loc->inode); - local->call_cnt = 1; + local = dht_local_init (frame, loc, NULL, GF_FOP_GETXATTR); + if (!local) { + op_errno = ENOMEM; - STACK_WIND (frame, dht_attr_cbk, - subvol, subvol->fops->truncate, - loc, offset); + goto err; + } - return 0; + layout = local->layout; + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "layout is NULL"); + op_errno = ENOENT; + goto err; + } -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + if (key) { + local->key = gf_strdup (key); + if (!local->key) { + op_errno = ENOMEM; + goto err; + } + } - return 0; -} + if (key && + (strncmp (key, GF_XATTR_GET_REAL_FILENAME_KEY, + strlen (GF_XATTR_GET_REAL_FILENAME_KEY)) == 0) + && DHT_IS_DIR(layout)) { + dht_getxattr_get_real_filename (frame, this, loc, key, xdata); + return 0; + } + /* for file use cached subvolume (obviously!): see if {} + * below + * for directory: + * wind to all subvolumes and exclude subvolumes which + * return ENOTCONN (in callback) + * + * NOTE: Don't trust inode here, as that may not be valid + * (until inode_link() happens) + */ + if (key && DHT_IS_DIR(layout) && + (XATTR_IS_PATHINFO (key) + || (strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0))) { + (void) strncpy (local->xsel, key, 256); + cnt = local->call_cnt = layout->cnt; + for (i = 0; i < cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND (frame, dht_vgetxattr_dir_cbk, + subvol, subvol->fops->getxattr, + loc, key, NULL); + } + return 0; + } -int -dht_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset) -{ - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; + /* node-uuid or pathinfo for files */ + if (key && ((strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0) + || XATTR_IS_PATHINFO (key))) { + cached_subvol = local->cached_subvol; + (void) strncpy (local->xsel, key, 256); + local->call_cnt = 1; + STACK_WIND (frame, dht_vgetxattr_cbk, cached_subvol, + cached_subvol->fops->getxattr, loc, key, NULL); - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + return 0; + } - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + if (key && (strcmp (key, GF_XATTR_LINKINFO_KEY) == 0)) { + hashed_subvol = dht_subvol_get_hashed (this, loc); + if (!hashed_subvol) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get" + "hashed subvol for %s", loc->path); + op_errno = EINVAL; + goto err; + } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + cached_subvol = dht_subvol_get_cached (this, loc->inode); + if (!cached_subvol) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get" + "cached subvol for %s", loc->path); + op_errno = EINVAL; + goto err; + } - local->inode = inode_ref (fd->inode); - local->call_cnt = 1; + if (hashed_subvol == cached_subvol) { + op_errno = ENODATA; + goto err; + } + if (hashed_subvol) { + STACK_WIND (frame, dht_linkinfo_getxattr_cbk, hashed_subvol, + hashed_subvol->fops->getxattr, loc, + GF_XATTR_PATHINFO_KEY, NULL); + return 0; + } + op_errno = ENODATA; + goto err; + } - STACK_WIND (frame, dht_attr_cbk, - subvol, subvol->fops->ftruncate, - fd, offset); + if (key && (!strcmp (GF_XATTR_MARKER_KEY, key)) + && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { + if (DHT_IS_DIR(layout)) { + cnt = layout->cnt; + } else { + cnt = 1; + } - return 0; + sub_volumes = alloca ( cnt * sizeof (xlator_t *)); + for (i = 0; i < cnt; i++) + *(sub_volumes + i) = layout->list[i].xlator; -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + if (cluster_getmarkerattr (frame, this, loc, key, + local, dht_getxattr_unwind, + sub_volumes, cnt, + MARKER_UUID_TYPE, marker_uuid_default_gauge, + conf->vol_uuid)) { + op_errno = EINVAL; + goto err; + } - return 0; -} + return 0; + } + if (key && !strcmp (GF_XATTR_QUOTA_LIMIT_LIST, key)) { + /* quota hardlimit and aggregated size of a directory is stored + * in inode contexts of each brick. Hence its good enough that + * we send getxattr for this key to any brick. + */ + local->call_cnt = 1; + subvol = dht_first_up_subvol (this); + STACK_WIND (frame, dht_getxattr_cbk, subvol, + subvol->fops->getxattr, loc, key, xdata); + return 0; + } -int -dht_err_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) -{ - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; + if (key && *conf->vol_uuid) { + if ((match_uuid_local (key, conf->vol_uuid) == 0) && + (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { + if (DHT_IS_DIR(layout)) { + cnt = layout->cnt; + } else { + cnt = 1; + } + sub_volumes = alloca ( cnt * sizeof (xlator_t *)); + for (i = 0; i < cnt; i++) + sub_volumes[i] = layout->list[i].xlator; + + if (cluster_getmarkerattr (frame, this, loc, key, + local, dht_getxattr_unwind, + sub_volumes, cnt, + MARKER_XTIME_TYPE, + marker_xtime_default_gauge, + conf->vol_uuid)) { + op_errno = EINVAL; + goto err; + } + return 0; + } + } - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } + if (DHT_IS_DIR(layout)) { + cnt = local->call_cnt = layout->cnt; + } else { + cnt = local->call_cnt = 1; + } - local->op_ret = 0; - } -unlock: - UNLOCK (&frame->lock); + for (i = 0; i < cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND (frame, dht_getxattr_cbk, + subvol, subvol->fops->getxattr, + loc, key, NULL); + } + return 0; - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno); +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL); return 0; } - +#undef DHT_IS_DIR int -dht_access (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t mask) +dht_fgetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *key, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - + xlator_t *subvol = NULL; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int op_errno = -1; + int i = 0; + int cnt = 0; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + VALIDATE_OR_GOTO (this->private, err); - subvol = dht_subvol_get_cached (this, loc->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + local = dht_local_init (frame, NULL, fd, GF_FOP_FGETXATTR); + if (!local) { + op_errno = ENOMEM; - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + goto err; + } - local->call_cnt = 1; + layout = local->layout; + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "layout is NULL"); + op_errno = ENOENT; + goto err; + } - STACK_WIND (frame, dht_err_cbk, - subvol, subvol->fops->access, - loc, mask); + if (key) { + local->key = gf_strdup (key); + if (!local->key) { + op_errno = ENOMEM; + goto err; + } + } - return 0; + if ((fd->inode->ia_type == IA_IFDIR) + && key + && (strncmp (key, GF_XATTR_LOCKINFO_KEY, + strlen (GF_XATTR_LOCKINFO_KEY) != 0))) { + cnt = local->call_cnt = layout->cnt; + } else { + cnt = local->call_cnt = 1; + } + + for (i = 0; i < cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND (frame, dht_getxattr_cbk, + subvol, subvol->fops->fgetxattr, + fd, key, NULL); + } + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (fgetxattr, frame, -1, op_errno, NULL, NULL); - return 0; + return 0; } - int -dht_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, const char *path) +dht_fsetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, dict_t *xattr, int flags, dict_t *xdata) { - DHT_STACK_UNWIND (frame, op_ret, op_errno, path); + xlator_t *subvol = NULL; + dht_local_t *local = NULL; + int op_errno = EINVAL; + dht_conf_t *conf = NULL; - return 0; -} + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + VALIDATE_OR_GOTO (this->private, err); + conf = this->private; -int -dht_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size) -{ - xlator_t *subvol = NULL; - int op_errno = -1; + GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr, + op_errno, err); + local = dht_local_init (frame, NULL, fd, GF_FOP_FSETXATTR); + if (!local) { + op_errno = ENOMEM; + goto err; + } - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } - subvol = dht_subvol_get_cached (this, loc->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + local->call_cnt = 1; - STACK_WIND (frame, dht_readlink_cbk, - subvol, subvol->fops->readlink, - loc, size); + STACK_WIND (frame, dht_err_cbk, subvol, subvol->fops->fsetxattr, + fd, xattr, flags, NULL); - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); - return 0; + return 0; } -int -dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xattr) +static int +dht_common_setxattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xdata) { - DHT_STACK_UNWIND (frame, op_ret, op_errno, xattr); + DHT_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); return 0; } - int -dht_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *key) +dht_checking_pathinfo_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, + dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; + int i = -1; + int ret = -1; + char *value = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + call_frame_t *prev = NULL; + int this_call_cnt = 0; + local = frame->local; + prev = cookie; + conf = this->private; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); + if (op_ret == -1) + goto out; - subvol = dht_subvol_get_cached (this, loc->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } - STACK_WIND (frame, dht_getxattr_cbk, - subvol, subvol->fops->getxattr, - loc, key); + ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &value); + if (ret) + goto out; - return 0; + if (!strcmp (value, local->key)) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == prev->this) + conf->decommissioned_bricks[i] = prev->this; + } + } -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL); +out: + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + DHT_STACK_UNWIND (setxattr, frame, local->op_ret, ENOTSUP, NULL); + } + return 0; - return 0; } - int dht_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr, int flags) + loc_t *loc, dict_t *xattr, int flags, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - + xlator_t *subvol = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + dht_layout_t *layout = NULL; + int i = 0; + int op_errno = EINVAL; + int ret = -1; + data_t *tmp = NULL; + uint32_t dir_spread = 0; + char value[4096] = {0,}; + gf_dht_migrate_data_type_t forced_rebalance = GF_DHT_MIGRATE_DATA; + int call_cnt = 0; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - subvol = dht_subvol_get_cached (this, loc->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + conf = this->private; - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr, + op_errno, err); - local->call_cnt = 1; + local = dht_local_init (frame, loc, NULL, GF_FOP_SETXATTR); + if (!local) { + op_errno = ENOMEM; + goto err; + } - STACK_WIND (frame, dht_err_cbk, - subvol, subvol->fops->setxattr, - loc, xattr, flags); + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } - return 0; + layout = local->layout; + if (!layout) { + gf_log (this->name, GF_LOG_DEBUG, + "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + local->call_cnt = call_cnt = layout->cnt; - return 0; -} + tmp = dict_get (xattr, "distribute.migrate-data"); + if (tmp) { + if (IA_ISDIR (loc->inode->ia_type)) { + op_errno = ENOTSUP; + goto err; + } + /* TODO: need to interpret the 'value' for more meaning + (ie, 'target' subvolume given there, etc) */ + memcpy (value, tmp->data, tmp->len); + if (strcmp (value, "force") == 0) + forced_rebalance = + GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS; -int -dht_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *key) -{ - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; + if (conf->decommission_in_progress) + forced_rebalance = GF_DHT_MIGRATE_HARDLINK; + local->rebalance.target_node = dht_subvol_get_hashed (this, loc); + if (!local->rebalance.target_node) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get " + "hashed subvol for %s", loc->path); + op_errno = EINVAL; + goto err; + } - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); + local->rebalance.from_subvol = local->cached_subvol; - subvol = dht_subvol_get_cached (this, loc->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + if (local->rebalance.target_node == local->rebalance.from_subvol) { + op_errno = EEXIST; + goto err; + } + if (local->rebalance.target_node) { + local->flags = forced_rebalance; - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + ret = dht_start_rebalance_task (this, frame); + if (!ret) + return 0; - local->call_cnt = 1; + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to create a new synctask", + loc->path); + } + op_errno = EINVAL; + goto err; - STACK_WIND (frame, dht_err_cbk, - subvol, subvol->fops->removexattr, - loc, key); + } - return 0; + tmp = dict_get (xattr, "decommission-brick"); + if (tmp) { + /* This operation should happen only on '/' */ + if (!__is_root_gfid (loc->inode->gfid)) { + op_errno = ENOTSUP; + goto err; + } + + memcpy (value, tmp->data, ((tmp->len < 4095) ? tmp->len : 4095)); + local->key = gf_strdup (value); + local->call_cnt = conf->subvolume_cnt; + + for (i = 0 ; i < conf->subvolume_cnt; i++) { + /* Get the pathinfo, and then compare */ + STACK_WIND (frame, dht_checking_pathinfo_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->getxattr, + loc, GF_XATTR_PATHINFO_KEY, NULL); + } + return 0; + } + + tmp = dict_get (xattr, GF_XATTR_FIX_LAYOUT_KEY); + if (tmp) { + gf_log (this->name, GF_LOG_INFO, + "fixing the layout of %s", loc->path); + + ret = dht_fix_directory_layout (frame, dht_common_setxattr_cbk, + layout); + if (ret) { + op_errno = ENOTCONN; + goto err; + } + return ret; + } + + tmp = dict_get (xattr, "distribute.directory-spread-count"); + if (tmp) { + /* Setxattr value is packed as 'binary', not string */ + memcpy (value, tmp->data, ((tmp->len < 4095)?tmp->len:4095)); + ret = gf_string2uint32 (value, &dir_spread); + if (!ret && ((dir_spread <= conf->subvolume_cnt) && + (dir_spread > 0))) { + layout->spread_cnt = dir_spread; + + ret = dht_fix_directory_layout (frame, + dht_common_setxattr_cbk, + layout); + if (ret) { + op_errno = ENOTCONN; + goto err; + } + return ret; + } + gf_log (this->name, GF_LOG_ERROR, + "wrong 'directory-spread-count' value (%s)", value); + op_errno = ENOTSUP; + goto err; + } + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_err_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->setxattr, + loc, xattr, flags, xdata); + } + + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); - return 0; + return 0; } int -dht_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd) +dht_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + local = frame->local; + prev = cookie; - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto unlock; + } - local->op_ret = 0; - } + local->op_ret = 0; + } unlock: - UNLOCK (&frame->lock); + UNLOCK (&frame->lock); - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->fd); + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + DHT_STACK_UNWIND (removexattr, frame, local->op_ret, + local->op_errno, NULL); + } return 0; } int -dht_open (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags, fd_t *fd) +dht_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *key, dict_t *xdata) { - xlator_t *subvol = NULL; - int ret = -1; + xlator_t *subvol = NULL; int op_errno = -1; - dht_local_t *local = NULL; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int call_cnt = 0; + dht_conf_t *conf = NULL; + int i; - VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (this->private, err); - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + conf = this->private; - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + GF_IF_NATIVE_XATTR_GOTO (conf->wild_xattr_name, key, op_errno, err); - local->fd = fd_ref (fd); - ret = loc_dup (loc, &local->loc); - if (ret == -1) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); - local->call_cnt = 1; + local = dht_local_init (frame, loc, NULL, GF_FOP_REMOVEXATTR); + if (!local) { + op_errno = ENOMEM; + goto err; + } - STACK_WIND (frame, dht_fd_cbk, - subvol, subvol->fops->open, - loc, flags, fd); + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } - return 0; + layout = local->layout; + if (!local->layout) { + gf_log (this->name, GF_LOG_DEBUG, + "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + local->call_cnt = call_cnt = layout->cnt; + local->key = gf_strdup (key); - return 0; -} + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_removexattr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->removexattr, + loc, key, NULL); + } + return 0; -int -dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - struct iovec *vector, int count, struct stat *stbuf, - struct iobref *iobref) -{ - DHT_STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf, - iobref); +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); return 0; } - int -dht_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off) +dht_fremovexattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *key, dict_t *xdata) { - xlator_t *subvol = NULL; + xlator_t *subvol = NULL; int op_errno = -1; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int call_cnt = 0; + dht_conf_t *conf = 0; + int i; - VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (this->private, err); - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + conf = this->private; - STACK_WIND (frame, dht_readv_cbk, - subvol, subvol->fops->readv, - fd, size, off); + GF_IF_NATIVE_XATTR_GOTO (conf->wild_xattr_name, key, op_errno, err); - return 0; + VALIDATE_OR_GOTO (frame, err); -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL, 0, NULL, NULL); + local = dht_local_init (frame, NULL, fd, GF_FOP_FREMOVEXATTR); + if (!local) { + op_errno = ENOMEM; + goto err; + } - return 0; -} + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for inode=%s", + uuid_utoa (fd->inode->gfid)); + op_errno = EINVAL; + goto err; + } + + layout = local->layout; + if (!local->layout) { + gf_log (this->name, GF_LOG_DEBUG, + "no layout for inode=%s", uuid_utoa (fd->inode->gfid)); + op_errno = EINVAL; + goto err; + } + local->call_cnt = call_cnt = layout->cnt; + local->key = gf_strdup (key); -int -dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct stat *stbuf) -{ - DHT_STACK_UNWIND (frame, op_ret, op_errno, stbuf); + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_removexattr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->fremovexattr, + fd, key, NULL); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (fremovexattr, frame, -1, op_errno, NULL); return 0; } int -dht_writev (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct iovec *vector, int count, off_t off, - struct iobref *iobref) +dht_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + local = frame->local; + prev = cookie; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto unlock; + } - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + local->op_ret = 0; + } +unlock: + UNLOCK (&frame->lock); - STACK_WIND (frame, dht_writev_cbk, - subvol, subvol->fops->writev, - fd, vector, count, off, iobref); + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) + DHT_STACK_UNWIND (open, frame, local->op_ret, local->op_errno, + local->fd, NULL); - return 0; + return 0; +} -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL, 0); +/* + * dht_normalize_stats - + */ +static void +dht_normalize_stats (struct statvfs *buf, unsigned long bsize, + unsigned long frsize) +{ + double factor = 0; - return 0; -} + if (buf->f_bsize != bsize) { + buf->f_bsize = bsize; + } + if (buf->f_frsize != frsize) { + factor = ((double) buf->f_frsize) / frsize; + buf->f_frsize = frsize; + buf->f_blocks = (fsblkcnt_t) (factor * buf->f_blocks); + buf->f_bfree = (fsblkcnt_t) (factor * buf->f_bfree); + buf->f_bavail = (fsblkcnt_t) (factor * buf->f_bavail); + + } +} int -dht_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +dht_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct statvfs *statvfs, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; + dht_local_t *local = NULL; + int this_call_cnt = 0; + int bsize = 0; + int frsize = 0; + int8_t quota_deem_statfs = 0; + GF_UNUSED int ret = 0; + unsigned long new_usage = 0; + unsigned long cur_usage = 0; + ret = dict_get_int8 (xdata, "quota-deem-statfs", "a_deem_statfs); - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + local = frame->local; - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + goto unlock; + } + if (!statvfs) { + op_errno = EINVAL; + local->op_ret = -1; + goto unlock; + } + local->op_ret = 0; + + if (quota_deem_statfs) { + new_usage = statvfs->f_blocks - statvfs->f_bfree; + cur_usage = local->statvfs.f_blocks - local->statvfs.f_bfree; + /* We take the maximux of the usage from the subvols */ + if (new_usage >= cur_usage) + local->statvfs = *statvfs; + goto unlock; + } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + if (local->statvfs.f_bsize != 0) { + bsize = max(local->statvfs.f_bsize, statvfs->f_bsize); + frsize = max(local->statvfs.f_frsize, statvfs->f_frsize); + dht_normalize_stats(&local->statvfs, bsize, frsize); + dht_normalize_stats(statvfs, bsize, frsize); + } else { + local->statvfs.f_bsize = statvfs->f_bsize; + local->statvfs.f_frsize = statvfs->f_frsize; + } - local->fd = fd_ref (fd); - local->call_cnt = 1; + local->statvfs.f_blocks += statvfs->f_blocks; + local->statvfs.f_bfree += statvfs->f_bfree; + local->statvfs.f_bavail += statvfs->f_bavail; + local->statvfs.f_files += statvfs->f_files; + local->statvfs.f_ffree += statvfs->f_ffree; + local->statvfs.f_favail += statvfs->f_favail; + local->statvfs.f_fsid = statvfs->f_fsid; + local->statvfs.f_flag = statvfs->f_flag; + local->statvfs.f_namemax = statvfs->f_namemax; - STACK_WIND (frame, dht_err_cbk, - subvol, subvol->fops->flush, fd); - return 0; + } +unlock: + UNLOCK (&frame->lock); -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno); - return 0; + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) + DHT_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno, + &local->statvfs, xdata); + + return 0; } int -dht_fsync (call_frame_t *frame, xlator_t *this, - fd_t *fd, int datasync) +dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - xlator_t *subvol = NULL; + xlator_t *subvol = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; int op_errno = -1; - dht_local_t *local = NULL; - + int i = -1; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (this->private, err); - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + conf = this->private; - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } - local->call_cnt = 1; + local = dht_local_init (frame, NULL, NULL, GF_FOP_STATFS); + if (!local) { + op_errno = ENOMEM; + goto err; + } - STACK_WIND (frame, dht_err_cbk, - subvol, subvol->fops->fsync, - fd, datasync); + if (IA_ISDIR (loc->inode->ia_type)) { + local->call_cnt = conf->subvolume_cnt; - return 0; + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_statfs_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->statfs, loc, + xdata); + } + return 0; + } -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno); + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } - return 0; -} + local->call_cnt = 1; + STACK_WIND (frame, dht_statfs_cbk, + subvol, subvol->fops->statfs, loc, xdata); -int -dht_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct flock *flock) -{ - DHT_STACK_UNWIND (frame, op_ret, op_errno, flock); + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); return 0; } int -dht_lk (call_frame_t *frame, xlator_t *this, - fd_t *fd, int cmd, struct flock *flock) +dht_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) { - xlator_t *subvol = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; int op_errno = -1; - + int i = -1; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (this->private, err); - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, dht_lk_cbk, - subvol, subvol->fops->lk, - fd, cmd, flock); - - return 0; - -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL); - - return 0; -} - - -int -dht_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct statvfs *statvfs) -{ - dht_local_t *local = NULL; - int this_call_cnt = 0; + conf = this->private; + local = dht_local_init (frame, loc, fd, GF_FOP_OPENDIR); + if (!local) { + op_errno = ENOMEM; - local = frame->local; + goto err; + } - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - goto unlock; - } - local->op_ret = 0; - - /* TODO: normalize sizes */ - local->statvfs.f_bsize = statvfs->f_bsize; - local->statvfs.f_frsize = statvfs->f_frsize; - - local->statvfs.f_blocks += statvfs->f_blocks; - local->statvfs.f_bfree += statvfs->f_bfree; - local->statvfs.f_bavail += statvfs->f_bavail; - local->statvfs.f_files += statvfs->f_files; - local->statvfs.f_ffree += statvfs->f_ffree; - local->statvfs.f_favail += statvfs->f_favail; - local->statvfs.f_fsid = statvfs->f_fsid; - local->statvfs.f_flag = statvfs->f_flag; - local->statvfs.f_namemax = statvfs->f_namemax; + local->call_cnt = conf->subvolume_cnt; - } -unlock: - UNLOCK (&frame->lock); + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_fd_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->opendir, + loc, fd, xdata); + } + return 0; - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->statvfs); +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (opendir, frame, -1, op_errno, NULL, NULL); return 0; } int -dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) +dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, gf_dirent_t *orig_entries, dict_t *xdata) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int op_errno = -1; - int i = -1; + dht_local_t *local = NULL; + gf_dirent_t entries; + gf_dirent_t *orig_entry = NULL; + gf_dirent_t *entry = NULL; + call_frame_t *prev = NULL; + xlator_t *next_subvol = NULL; + off_t next_offset = 0; + int count = 0; + dht_layout_t *layout = 0; + dht_conf_t *conf = NULL; + xlator_t *subvol = 0; + int ret = 0; + INIT_LIST_HEAD (&entries.list); + prev = cookie; + local = frame->local; + conf = this->private; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); + if (op_ret < 0) + goto done; - conf = this->private; + if (!local->layout) + local->layout = dht_layout_get (this, local->fd->inode); - local = dht_local_init (frame); - local->call_cnt = conf->subvolume_cnt; + layout = local->layout; - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (frame, dht_statfs_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->statfs, loc); - } + list_for_each_entry (orig_entry, (&orig_entries->list), list) { + next_offset = orig_entry->d_off; + if (check_is_dir (NULL, (&orig_entry->d_stat), NULL) && + (prev->this != local->first_up_subvol)) { + continue; + } + if (check_is_linkfile (NULL, (&orig_entry->d_stat), + orig_entry->dict, + conf->link_xattr_name)) { + continue; + } - return 0; + entry = gf_dirent_for_name (orig_entry->d_name); + if (!entry) { -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + goto unwind; + } - return 0; -} + /* Do this if conf->search_unhashed is set to "auto" */ + if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) { + subvol = dht_layout_search (this, layout, + orig_entry->d_name); + if (!subvol || (subvol != prev->this)) { + /* TODO: Count the number of entries which need + linkfile to prove its existence in fs */ + layout->search_unhashed++; + } + } + dht_itransform (this, prev->this, orig_entry->d_off, + &entry->d_off); -int -dht_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) -{ - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int ret = -1; - int op_errno = -1; - int i = -1; + entry->d_stat = orig_entry->d_stat; + entry->d_ino = orig_entry->d_ino; + entry->d_type = orig_entry->d_type; + entry->d_len = orig_entry->d_len; + if (orig_entry->dict) + entry->dict = dict_ref (orig_entry->dict); + + /* making sure we set the inode ctx right with layout, + currently possible only for non-directories, so for + directories don't set entry inodes */ + if (!IA_ISDIR(entry->d_stat.ia_type) && orig_entry->inode) { + ret = dht_layout_preset (this, prev->this, + orig_entry->inode); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "failed to link the layout in inode"); + entry->inode = inode_ref (orig_entry->inode); + } else if (orig_entry->inode) { + dht_inode_ctx_time_update (orig_entry->inode, this, + &entry->d_stat, 1); + } - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + list_add_tail (&entry->list, &entries.list); + count++; + } + op_ret = count; + /* We need to ensure that only the last subvolume's end-of-directory + * notification is respected so that directory reading does not stop + * before all subvolumes have been read. That could happen because the + * posix for each subvolume sends a ENOENT on end-of-directory but in + * distribute we're not concerned only with a posix's view of the + * directory but the aggregated namespace' view of the directory. + */ + if (prev->this != dht_last_up_subvol (this)) + op_errno = 0; - conf = this->private; +done: + if (count == 0) { + /* non-zero next_offset means that + EOF is not yet hit on the current subvol + */ + if (next_offset == 0) { + next_subvol = dht_subvol_next (this, prev->this); + } else { + next_subvol = prev->this; + } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + if (!next_subvol) { + goto unwind; + } - local->fd = fd_ref (fd); - ret = loc_dup (loc, &local->loc); - if (ret == -1) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + if (conf->readdir_optimize == _gf_true) { + if (next_subvol != local->first_up_subvol) { + ret = dict_set_int32 (local->xattr, + GF_READDIR_SKIP_DIRS, 1); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "dict set failed"); + } else { + dict_del (local->xattr, + GF_READDIR_SKIP_DIRS); + } + } - local->call_cnt = conf->subvolume_cnt; + STACK_WIND (frame, dht_readdirp_cbk, + next_subvol, next_subvol->fops->readdirp, + local->fd, local->size, next_offset, + local->xattr); + return 0; + } - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (frame, dht_fd_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->opendir, - loc, fd); - } +unwind: + if (op_ret < 0) + op_ret = 0; - return 0; + DHT_STACK_UNWIND (readdirp, frame, op_ret, op_errno, &entries, NULL); -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + gf_dirent_free (&entries); - return 0; + return 0; } + int dht_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *orig_entries) -{ - dht_local_t *local = NULL; - gf_dirent_t entries; - gf_dirent_t *orig_entry = NULL; - gf_dirent_t *entry = NULL; - call_frame_t *prev = NULL; - xlator_t *next_subvol = NULL; + int op_ret, int op_errno, gf_dirent_t *orig_entries, + dict_t *xdata) +{ + dht_local_t *local = NULL; + gf_dirent_t entries; + gf_dirent_t *orig_entry = NULL; + gf_dirent_t *entry = NULL; + call_frame_t *prev = NULL; + xlator_t *next_subvol = NULL; off_t next_offset = 0; - int count = 0; + int count = 0; + dht_layout_t *layout = 0; + xlator_t *subvol = 0; + INIT_LIST_HEAD (&entries.list); + prev = cookie; + local = frame->local; - INIT_LIST_HEAD (&entries.list); - prev = cookie; - local = frame->local; + if (op_ret < 0) + goto done; + + if (!local->layout) + local->layout = dht_layout_get (this, local->fd->inode); - if (op_ret < 0) - goto done; + layout = local->layout; - list_for_each_entry (orig_entry, (&orig_entries->list), list) { + list_for_each_entry (orig_entry, (&orig_entries->list), list) { next_offset = orig_entry->d_off; - if (check_is_linkfile (NULL, (&orig_entry->d_stat), NULL) - || (check_is_dir (NULL, (&orig_entry->d_stat), NULL) - && (prev->this != dht_first_up_subvol (this)))) { - continue; - } + subvol = dht_layout_search (this, layout, orig_entry->d_name); - entry = gf_dirent_for_name (orig_entry->d_name); - if (!entry) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto unwind; - } + if (!subvol || (subvol == prev->this)) { + entry = gf_dirent_for_name (orig_entry->d_name); + if (!entry) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto unwind; + } - dht_itransform (this, prev->this, orig_entry->d_ino, - &entry->d_ino); - dht_itransform (this, prev->this, orig_entry->d_off, - &entry->d_off); + dht_itransform (this, prev->this, orig_entry->d_off, + &entry->d_off); - entry->d_type = orig_entry->d_type; - entry->d_len = orig_entry->d_len; + entry->d_ino = orig_entry->d_ino; + entry->d_type = orig_entry->d_type; + entry->d_len = orig_entry->d_len; - list_add_tail (&entry->list, &entries.list); - count++; - } - op_ret = count; + list_add_tail (&entry->list, &entries.list); + count++; + } + } + op_ret = count; + /* We need to ensure that only the last subvolume's end-of-directory + * notification is respected so that directory reading does not stop + * before all subvolumes have been read. That could happen because the + * posix for each subvolume sends a ENOENT on end-of-directory but in + * distribute we're not concerned only with a posix's view of the + * directory but the aggregated namespace' view of the directory. + */ + if (prev->this != dht_last_up_subvol (this)) + op_errno = 0; done: - if (count == 0) { + if (count == 0) { /* non-zero next_offset means that EOF is not yet hit on the current subvol */ @@ -2162,771 +3268,911 @@ done: next_subvol = prev->this; } - if (!next_subvol) { - goto unwind; - } + if (!next_subvol) { + goto unwind; + } - STACK_WIND (frame, dht_readdir_cbk, - next_subvol, next_subvol->fops->readdir, - local->fd, local->size, next_offset); - return 0; - } + STACK_WIND (frame, dht_readdir_cbk, + next_subvol, next_subvol->fops->readdir, + local->fd, local->size, next_offset, NULL); + return 0; + } unwind: - if (op_ret < 0) - op_ret = 0; + if (op_ret < 0) + op_ret = 0; - DHT_STACK_UNWIND (frame, op_ret, op_errno, &entries); + DHT_STACK_UNWIND (readdir, frame, op_ret, op_errno, &entries, NULL); - gf_dirent_free (&entries); + gf_dirent_free (&entries); return 0; } int -dht_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t yoff) +dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t yoff, int whichop, dict_t *dict) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; + dht_local_t *local = NULL; int op_errno = -1; - xlator_t *xvol = NULL; - off_t xoff = 0; - + xlator_t *xvol = NULL; + off_t xoff = 0; + int ret = 0; + dht_conf_t *conf = NULL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (this->private, err); - conf = this->private; - - local = dht_local_init (frame); - if (!local) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - op_errno = ENOMEM; - goto err; - } + conf = this->private; - local->fd = fd_ref (fd); - local->size = size; + local = dht_local_init (frame, NULL, NULL, whichop); + if (!local) { + op_errno = ENOMEM; + goto err; + } - dht_deitransform (this, yoff, &xvol, (uint64_t *)&xoff); + local->fd = fd_ref (fd); + local->size = size; + local->xattr_req = (dict)? dict_ref (dict) : NULL; + local->first_up_subvol = dht_first_up_subvol (this); + + dht_deitransform (this, yoff, &xvol, (uint64_t *)&xoff); + + /* TODO: do proper readdir */ + if (whichop == GF_FOP_READDIRP) { + if (dict) + local->xattr = dict_ref (dict); + else + local->xattr = dict_new (); + + if (local->xattr) { + ret = dict_set_uint32 (local->xattr, + conf->link_xattr_name, 256); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "failed to set '%s' key", + conf->link_xattr_name); + if (conf->readdir_optimize == _gf_true) { + if (xvol != local->first_up_subvol) { + ret = dict_set_int32 (local->xattr, + GF_READDIR_SKIP_DIRS, 1); + if (ret) + gf_log (this->name, + GF_LOG_ERROR, + "Dict set failed"); + } else { + dict_del (local->xattr, + GF_READDIR_SKIP_DIRS); + } + } + } - /* TODO: do proper readdir */ - STACK_WIND (frame, dht_readdir_cbk, - xvol, xvol->fops->readdir, - fd, size, xoff); + STACK_WIND (frame, dht_readdirp_cbk, xvol, xvol->fops->readdirp, + fd, size, xoff, local->xattr); + } else { + STACK_WIND (frame, dht_readdir_cbk, xvol, xvol->fops->readdir, + fd, size, xoff, local->xattr); + } - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (readdir, frame, -1, op_errno, NULL, NULL); - return 0; + return 0; +} + + +int +dht_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t yoff, dict_t *xdata) +{ + int op = GF_FOP_READDIR; + dht_conf_t *conf = NULL; + int i = 0; + + conf = this->private; + if (!conf) + goto out; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!conf->subvolume_status[i]) { + op = GF_FOP_READDIRP; + break; + } + } + + if (conf->use_readdirp) + op = GF_FOP_READDIRP; + +out: + dht_do_readdir (frame, this, fd, size, yoff, op, 0); + return 0; } +int +dht_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t yoff, dict_t *dict) +{ + dht_do_readdir (frame, this, fd, size, yoff, GF_FOP_READDIRP, dict); + return 0; +} + + int dht_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) + int op_ret, int op_errno, dict_t *xdata) { - dht_local_t *local = NULL; - int this_call_cnt = 0; + dht_local_t *local = NULL; + int this_call_cnt = 0; - local = frame->local; + local = frame->local; - LOCK (&frame->lock); - { - if (op_ret == -1) - local->op_errno = op_errno; + LOCK (&frame->lock); + { + if (op_ret == -1) + local->op_errno = op_errno; - if (op_ret == 0) - local->op_ret = 0; - } - UNLOCK (&frame->lock); + if (op_ret == 0) + local->op_ret = 0; + } + UNLOCK (&frame->lock); - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno); + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) + DHT_STACK_UNWIND (fsyncdir, frame, local->op_ret, + local->op_errno, xdata); return 0; } int -dht_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync) +dht_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, + int datasync, dict_t *xdata) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; int op_errno = -1; - int i = -1; - + int i = -1; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (this->private, err); - conf = this->private; + conf = this->private; - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + local = dht_local_init (frame, NULL, NULL, GF_FOP_FSYNCDIR); + if (!local) { + op_errno = ENOMEM; + goto err; + } - local->fd = fd_ref (fd); - local->call_cnt = conf->subvolume_cnt; + local->fd = fd_ref (fd); + local->call_cnt = conf->subvolume_cnt; - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (frame, dht_fsyncdir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->fsyncdir, - fd, datasync); - } + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_fsyncdir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->fsyncdir, + fd, datasync, xdata); + } - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); - return 0; + return 0; } int dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct stat *stbuf) + int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - call_frame_t *prev = NULL; - dht_layout_t *layout = NULL; - int ret = -1; + xlator_t *prev = NULL; + int ret = -1; + dht_local_t *local = NULL; - if (op_ret == -1) - goto out; + if (op_ret == -1) + goto out; - prev = cookie; + local = frame->local; + if (!local) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } - dht_itransform (this, prev->this, stbuf->st_ino, &stbuf->st_ino); - layout = dht_layout_for_subvol (this, prev->this); + prev = cookie; - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no pre-set layout for subvolume %s", - prev->this->name); - op_ret = -1; - op_errno = EINVAL; - goto out; - } + if (local->loc.parent) { - ret = inode_ctx_put (inode, this, (uint64_t)(long)layout); - if (ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "could not set inode context"); - op_ret = -1; - op_errno = EINVAL; - goto out; - } + dht_inode_ctx_time_update (local->loc.parent, this, + preparent, 0); + dht_inode_ctx_time_update (local->loc.parent, this, + postparent, 1); + } + ret = dht_layout_preset (this, prev, inode); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "could not set pre-set layout for subvolume %s", + prev? prev->name: NULL); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + if (local->linked == _gf_true) + dht_linkfile_attr_heal (frame, this); out: - DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf); - return 0; + /* + * FIXME: ia_size and st_blocks of preparent and postparent do not have + * correct values. since, preparent and postparent buffers correspond + * to a directory these two members should have values equal to sum of + * corresponding values from each of the subvolume. + * See dht_iatt_merge for reference. + */ + DHT_STRIP_PHASE1_FLAGS (stbuf); + DHT_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode, stbuf, + preparent, postparent, xdata); + return 0; } int dht_mknod_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - inode_t *inode, struct stat *stbuf) + inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { - dht_local_t *local = NULL; - xlator_t *cached_subvol = NULL; + dht_local_t *local = NULL; + xlator_t *cached_subvol = NULL; if (op_ret == -1) goto err; - local = frame->local; - cached_subvol = local->cached_subvol; + local = frame->local; + if (!local || !local->cached_subvol) { + op_errno = EINVAL; + goto err; + } - STACK_WIND (frame, dht_newfile_cbk, - cached_subvol, cached_subvol->fops->mknod, - &local->loc, local->mode, local->rdev); + cached_subvol = local->cached_subvol; + STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)cached_subvol, + cached_subvol, cached_subvol->fops->mknod, + &local->loc, local->mode, local->rdev, local->umask, + local->params); + + return 0; +err: + DHT_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); return 0; - err: - DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - return 0; } int dht_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t rdev) + loc_t *loc, mode_t mode, dev_t rdev, mode_t umask, dict_t *params) { - xlator_t *subvol = NULL; - int op_errno = -1; - int ret = -1; + xlator_t *subvol = NULL; + int op_errno = -1; xlator_t *avail_subvol = NULL; - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); + dht_local_t *local = NULL; - conf = this->private; + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); dht_get_du_info (frame, this, loc); - subvol = dht_subvol_get_hashed (this, loc); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - loc->path); - op_errno = ENOENT; - goto err; - } + local = dht_local_init (frame, loc, NULL, GF_FOP_MKNOD); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = dht_subvol_get_hashed (this, loc); + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } if (!dht_is_subvol_filled (this, subvol)) { gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); - - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->mknod, - loc, mode, rdev); + + STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, + subvol, subvol->fops->mknod, loc, mode, + rdev, umask, params); } else { - avail_subvol = dht_free_disk_available_subvol (this, subvol); + + avail_subvol = dht_free_disk_available_subvol (this, subvol, + local); if (avail_subvol != subvol) { - /* Choose the minimum filled volume, and create the + /* Choose the minimum filled volume, and create the files there */ - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } - ret = loc_dup (loc, &local->loc); - if (ret == -1) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + local->params = dict_ref (params); local->cached_subvol = avail_subvol; - local->mode = mode; + local->mode = mode; local->rdev = rdev; - - dht_linkfile_create (frame, + local->umask = umask; + dht_linkfile_create (frame, dht_mknod_linkfile_create_cbk, - avail_subvol, subvol, loc); + this, avail_subvol, subvol, loc); } else { gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); - - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->mknod, - loc, mode, rdev); + + STACK_WIND_COOKIE (frame, dht_newfile_cbk, + (void *)subvol, subvol, + subvol->fops->mknod, loc, mode, + rdev, umask, params); } } - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (mknod, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } int dht_symlink (call_frame_t *frame, xlator_t *this, - const char *linkname, loc_t *loc) + const char *linkname, loc_t *loc, mode_t umask, dict_t *params) { - xlator_t *subvol = NULL; - int op_errno = -1; + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); + local = dht_local_init (frame, loc, NULL, GF_FOP_SYMLINK); + if (!local) { + op_errno = ENOMEM; + goto err; + } - subvol = dht_subvol_get_hashed (this, loc); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - loc->path); - op_errno = ENOENT; - goto err; - } + subvol = dht_subvol_get_hashed (this, loc); + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } - gf_log (this->name, GF_LOG_TRACE, - "creating %s on %s", loc->path, subvol->name); + gf_log (this->name, GF_LOG_TRACE, + "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->symlink, - linkname, loc); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, subvol, + subvol->fops->symlink, linkname, loc, umask, + params); - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (link, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } int -dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) +dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) { - xlator_t *cached_subvol = NULL; - xlator_t *hashed_subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - - cached_subvol = dht_subvol_get_cached (this, loc->inode); - if (!cached_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + xlator_t *cached_subvol = NULL; + xlator_t *hashed_subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; - hashed_subvol = dht_subvol_get_hashed (this, loc); - if (!hashed_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - loc->path); - op_errno = EINVAL; - goto err; - } + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + if (dht_filter_loc_subvol_key (this, loc, &local->loc, + &cached_subvol)) { + gf_log (this->name, GF_LOG_INFO, + "unlinking %s on %s (given path %s)", + local->loc.path, cached_subvol->name, loc->path); + STACK_WIND (frame, dht_unlink_cbk, + cached_subvol, cached_subvol->fops->unlink, + &local->loc, xflag, xdata); + goto done; + } - local->call_cnt = 1; - if (hashed_subvol != cached_subvol) - local->call_cnt++; + local = dht_local_init (frame, loc, NULL, GF_FOP_UNLINK); + if (!local) { + op_errno = ENOMEM; - STACK_WIND (frame, dht_err_cbk, - cached_subvol, cached_subvol->fops->unlink, loc); + goto err; + } - if (hashed_subvol != cached_subvol) - STACK_WIND (frame, dht_err_cbk, - hashed_subvol, hashed_subvol->fops->unlink, loc); + hashed_subvol = dht_subvol_get_hashed (this, loc); + /* Dont fail unlink if hashed_subvol is NULL which can be the result + * of layout anomaly */ + if (!hashed_subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no subvolume in layout for path=%s", + loc->path); + } - return 0; + cached_subvol = local->cached_subvol; + if (!cached_subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + local->flags = xflag; + if (hashed_subvol && hashed_subvol != cached_subvol) { + STACK_WIND (frame, dht_unlink_linkfile_cbk, + hashed_subvol, hashed_subvol->fops->unlink, loc, + xflag, xdata); + } else { + STACK_WIND (frame, dht_unlink_cbk, + cached_subvol, cached_subvol->fops->unlink, loc, + xflag, xdata); + } +done: + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + return 0; } int dht_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct stat *stbuf) + int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { call_frame_t *prev = NULL; - dht_layout_t *layout = NULL; - dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; prev = cookie; - local = frame->local; + + local = frame->local; if (op_ret == -1) goto out; - layout = dht_layout_for_subvol (this, prev->this); - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no pre-set layout for subvolume %s", - prev->this->name); - op_ret = -1; - op_errno = EINVAL; - goto out; - } - - stbuf->st_ino = local->loc.inode->ino; + layout = dht_layout_for_subvol (this, prev->this); + if (!layout) { + gf_log (this->name, GF_LOG_DEBUG, + "no pre-set layout for subvolume %s", + prev->this->name); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + preparent, 0); + dht_inode_ctx_time_update (local->loc.parent, this, + postparent, 1); + } + if (local->linked == _gf_true) { + local->stbuf = *stbuf; + dht_linkfile_attr_heal (frame, this); + } out: - DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf); + DHT_STRIP_PHASE1_FLAGS (stbuf); + DHT_STACK_UNWIND (link, frame, op_ret, op_errno, inode, stbuf, preparent, + postparent, NULL); - return 0; + return 0; } int dht_link_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct stat *stbuf) + int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { - dht_local_t *local = NULL; - xlator_t *srcvol = NULL; - + dht_local_t *local = NULL; + xlator_t *srcvol = NULL; - if (op_ret == -1) - goto err; + if (op_ret == -1) + goto err; - local = frame->local; - srcvol = local->linkfile.srcvol; + local = frame->local; + srcvol = local->linkfile.srcvol; - STACK_WIND (frame, dht_link_cbk, - srcvol, srcvol->fops->link, - &local->loc, &local->loc2); + STACK_WIND (frame, dht_link_cbk, srcvol, srcvol->fops->link, + &local->loc, &local->loc2, xdata); - return 0; + return 0; err: - DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf); + DHT_STRIP_PHASE1_FLAGS (stbuf); + DHT_STACK_UNWIND (link, frame, op_ret, op_errno, inode, stbuf, preparent, + postparent, NULL); - return 0; + return 0; } int dht_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc) -{ - xlator_t *cached_subvol = NULL; - xlator_t *hashed_subvol = NULL; - int op_errno = -1; - int ret = -1; - dht_local_t *local = NULL; - - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (oldloc, err); - VALIDATE_OR_GOTO (newloc, err); - - cached_subvol = dht_subvol_get_cached (this, oldloc->inode); - if (!cached_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", oldloc->path); - op_errno = EINVAL; - goto err; - } + loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ + xlator_t *cached_subvol = NULL; + xlator_t *hashed_subvol = NULL; + int op_errno = -1; + int ret = -1; + dht_local_t *local = NULL; - hashed_subvol = dht_subvol_get_hashed (this, newloc); - if (!hashed_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - newloc->path); - op_errno = EINVAL; - goto err; - } + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (oldloc, err); + VALIDATE_OR_GOTO (newloc, err); - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + local = dht_local_init (frame, oldloc, NULL, GF_FOP_LINK); + if (!local) { + op_errno = ENOMEM; - ret = loc_copy (&local->loc, oldloc); - if (ret == -1) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + goto err; + } - ret = loc_copy (&local->loc2, newloc); - if (ret == -1) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + cached_subvol = local->cached_subvol; + if (!cached_subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for path=%s", oldloc->path); + op_errno = EINVAL; + goto err; + } - if (hashed_subvol != cached_subvol) { - dht_linkfile_create (frame, dht_link_linkfile_cbk, - cached_subvol, hashed_subvol, newloc); - } else { - STACK_WIND (frame, dht_link_cbk, - cached_subvol, cached_subvol->fops->link, - oldloc, newloc); - } + hashed_subvol = dht_subvol_get_hashed (this, newloc); + if (!hashed_subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no subvolume in layout for path=%s", + newloc->path); + op_errno = EINVAL; + goto err; + } - return 0; + ret = loc_copy (&local->loc2, newloc); + if (ret == -1) { + op_errno = ENOMEM; + goto err; + } + + if (hashed_subvol != cached_subvol) { + uuid_copy (local->gfid, oldloc->inode->gfid); + dht_linkfile_create (frame, dht_link_linkfile_cbk, this, + cached_subvol, hashed_subvol, newloc); + } else { + STACK_WIND (frame, dht_link_cbk, + cached_subvol, cached_subvol->fops->link, + oldloc, newloc, xdata); + } + + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } int dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - fd_t *fd, inode_t *inode, struct stat *stbuf) + int op_ret, int op_errno, + fd_t *fd, inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - call_frame_t *prev = NULL; - dht_layout_t *layout = NULL; - int ret = -1; + call_frame_t *prev = NULL; + int ret = -1; + dht_local_t *local = NULL; - if (op_ret == -1) - goto out; + if (op_ret == -1) + goto out; - prev = cookie; + local = frame->local; + if (!local) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } - dht_itransform (this, prev->this, stbuf->st_ino, &stbuf->st_ino); - layout = dht_layout_for_subvol (this, prev->this); + prev = cookie; - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no pre-set layout for subvolume %s", - prev->this->name); - op_ret = -1; - op_errno = EINVAL; - goto out; - } + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + preparent, 0); - ret = inode_ctx_put (inode, this, (uint64_t)(long)layout); - if (ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "could not set inode context"); - op_ret = -1; - op_errno = EINVAL; - goto out; - } + dht_inode_ctx_time_update (local->loc.parent, this, + postparent, 1); + } + ret = dht_layout_preset (this, prev->this, inode); + if (ret != 0) { + gf_log (this->name, GF_LOG_DEBUG, + "could not set preset layout for subvol %s", + prev->this->name); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + if (local->linked == _gf_true) { + local->stbuf = *stbuf; + dht_linkfile_attr_heal (frame, this); + } out: - DHT_STACK_UNWIND (frame, op_ret, op_errno, fd, inode, stbuf); - return 0; + DHT_STRIP_PHASE1_FLAGS (stbuf); + DHT_STACK_UNWIND (create, frame, op_ret, op_errno, fd, inode, stbuf, preparent, + postparent, NULL); + return 0; } int dht_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct stat *stbuf) + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { - dht_local_t *local = NULL; - xlator_t *cached_subvol = NULL; + dht_local_t *local = NULL; + xlator_t *cached_subvol = NULL; if (op_ret == -1) goto err; - local = frame->local; - cached_subvol = local->cached_subvol; + local = frame->local; + cached_subvol = local->cached_subvol; STACK_WIND (frame, dht_create_cbk, cached_subvol, cached_subvol->fops->create, - &local->loc, local->flags, local->mode, local->fd); + &local->loc, local->flags, local->mode, + local->umask, local->fd, local->params); return 0; - err: - DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL); - return 0; +err: + DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL, NULL); + return 0; } int dht_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, fd_t *fd) + loc_t *loc, int32_t flags, mode_t mode, + mode_t umask, fd_t *fd, dict_t *params) { - int op_errno = -1; - int ret = -1; - xlator_t *subvol = NULL; - dht_conf_t *conf = NULL; + int op_errno = -1; + xlator_t *subvol = NULL; dht_local_t *local = NULL; xlator_t *avail_subvol = NULL; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - - conf = this->private; + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); dht_get_du_info (frame, this, loc); - local = dht_local_init (frame); - if (!local) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - op_errno = ENOMEM; - goto err; - } + local = dht_local_init (frame, loc, fd, GF_FOP_CREATE); + if (!local) { + op_errno = ENOMEM; + goto err; + } - subvol = dht_subvol_get_hashed (this, loc); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - loc->path); - op_errno = ENOENT; - goto err; - } + if (dht_filter_loc_subvol_key (this, loc, &local->loc, + &subvol)) { + gf_log (this->name, GF_LOG_INFO, + "creating %s on %s (got create on %s)", + local->loc.path, subvol->name, loc->path); + STACK_WIND (frame, dht_create_cbk, + subvol, subvol->fops->create, + &local->loc, flags, mode, umask, fd, params); + goto done; + } + + subvol = dht_subvol_get_hashed (this, loc); + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } if (!dht_is_subvol_filled (this, subvol)) { gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); STACK_WIND (frame, dht_create_cbk, subvol, subvol->fops->create, - loc, flags, mode, fd); - } else { - /* Choose the minimum filled volume, and create the - files there */ - /* TODO */ - avail_subvol = dht_free_disk_available_subvol (this, subvol); - if (avail_subvol != subvol) { - ret = loc_dup (loc, &local->loc); - if (ret == -1) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } - - local->fd = fd_ref (fd); - local->flags = flags; - local->mode = mode; - - local->cached_subvol = avail_subvol; - local->hashed_subvol = subvol; - gf_log (this->name, GF_LOG_TRACE, - "creating %s on %s (link at %s)", loc->path, - avail_subvol->name, subvol->name); - dht_linkfile_create (frame, - dht_create_linkfile_create_cbk, - avail_subvol, subvol, loc); - } else { - gf_log (this->name, GF_LOG_TRACE, - "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_create_cbk, - subvol, subvol->fops->create, - loc, flags, mode, fd); - - } + loc, flags, mode, umask, fd, params); + goto done; } - - return 0; + /* Choose the minimum filled volume, and create the + files there */ + avail_subvol = dht_free_disk_available_subvol (this, subvol, local); + if (avail_subvol != subvol) { + local->params = dict_ref (params); + local->flags = flags; + local->mode = mode; + local->umask = umask; + local->cached_subvol = avail_subvol; + local->hashed_subvol = subvol; + gf_log (this->name, GF_LOG_TRACE, + "creating %s on %s (link at %s)", loc->path, + avail_subvol->name, subvol->name); + dht_linkfile_create (frame, dht_create_linkfile_create_cbk, + this, avail_subvol, subvol, loc); + goto done; + } + gf_log (this->name, GF_LOG_TRACE, + "creating %s on %s", loc->path, subvol->name); + STACK_WIND (frame, dht_create_cbk, + subvol, subvol->fops->create, + loc, flags, mode, umask, fd, params); +done: + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL, NULL); - return 0; + return 0; } int dht_mkdir_selfheal_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno) + xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + local = frame->local; + layout = local->selfheal.layout; - local = frame->local; - layout = local->selfheal.layout; + if (op_ret == 0) { + dht_layout_set (this, local->inode, layout); + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + &local->preparent, 0); - if (op_ret == 0) { - inode_ctx_put (local->inode, this, (uint64_t)(long)layout); - local->selfheal.layout = NULL; - local->stbuf.st_ino = local->st_ino; - } + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); + } + } - DHT_STACK_UNWIND (frame, op_ret, op_errno, - local->inode, &local->stbuf); + DHT_STACK_UNWIND (mkdir, frame, op_ret, op_errno, + local->inode, &local->stbuf, &local->preparent, + &local->postparent, NULL); - return 0; + return 0; } int dht_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, inode_t *inode, struct stat *stbuf) + int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - dht_local_t *local = NULL; - int this_call_cnt = 0; - int ret = -1; - int subvol_filled = 0; - call_frame_t *prev = NULL; - dht_layout_t *layout = NULL; - dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int this_call_cnt = 0; + int ret = -1; + gf_boolean_t subvol_filled = _gf_false; + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; - conf = this->private; - local = frame->local; - prev = cookie; - layout = local->layout; + local = frame->local; + prev = cookie; + layout = local->layout; subvol_filled = dht_is_subvol_filled (this, prev->this); - LOCK (&frame->lock); - { + LOCK (&frame->lock); + { if (subvol_filled && (op_ret != -1)) { ret = dht_layout_merge (this, layout, prev->this, -1, ENOSPC, NULL); } else { + if (op_ret == -1 && op_errno == EEXIST) + /* Very likely just a race between mkdir and + self-heal (from lookup of a concurrent mkdir + attempt). + Ignore error for now. layout setting will + anyways fail if this was a different (old) + pre-existing different directory. + */ + op_ret = 0; ret = dht_layout_merge (this, layout, prev->this, op_ret, op_errno, NULL); } + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to merge layouts", local->loc.path); - if (op_ret == -1) { - local->op_errno = op_errno; - goto unlock; - } - dht_stat_merge (this, &local->stbuf, stbuf, prev->this); - } + if (op_ret == -1) { + local->op_errno = op_errno; + goto unlock; + } + dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); + dht_iatt_merge (this, &local->preparent, preparent, prev->this); + dht_iatt_merge (this, &local->postparent, postparent, + prev->this); + } unlock: - UNLOCK (&frame->lock); + UNLOCK (&frame->lock); - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - local->layout = NULL; - dht_selfheal_new_directory (frame, dht_mkdir_selfheal_cbk, - layout); - } + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + dht_selfheal_new_directory (frame, dht_mkdir_selfheal_cbk, + layout); + } return 0; } int -dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int op_ret, int op_errno, - inode_t *inode, struct stat *stbuf) +dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { - dht_local_t *local = NULL; - int ret = -1; - call_frame_t *prev = NULL; - dht_layout_t *layout = NULL; - dht_conf_t *conf = NULL; - int i = 0; - xlator_t *hashed_subvol = NULL; + dht_local_t *local = NULL; + int ret = -1; + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *hashed_subvol = NULL; - local = frame->local; - prev = cookie; - layout = local->layout; - conf = this->private; - hashed_subvol = local->hashed_subvol; + VALIDATE_OR_GOTO (this->private, err); + + local = frame->local; + prev = cookie; + layout = local->layout; + conf = this->private; + hashed_subvol = local->hashed_subvol; + + if (uuid_is_null (local->loc.gfid) && !op_ret) + uuid_copy (local->loc.gfid, stbuf->ia_gfid); if (dht_is_subvol_filled (this, hashed_subvol)) ret = dht_layout_merge (this, layout, prev->this, @@ -2934,47 +4180,55 @@ dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie, else ret = dht_layout_merge (this, layout, prev->this, op_ret, op_errno, NULL); - - if (op_ret == -1) { - local->op_errno = op_errno; - goto err; - } - local->op_ret = 0; - dht_stat_merge (this, &local->stbuf, stbuf, prev->this); + /* TODO: we may have to return from the function + if layout merge fails. For now, lets just log an error */ + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to merge layouts", local->loc.path); + + if (op_ret == -1) { + local->op_errno = op_errno; + goto err; + } + local->op_ret = 0; - local->st_ino = local->stbuf.st_ino; + dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); + dht_iatt_merge (this, &local->preparent, preparent, prev->this); + dht_iatt_merge (this, &local->postparent, postparent, prev->this); - local->call_cnt = conf->subvolume_cnt - 1; - - if (local->call_cnt == 0) { - local->layout = NULL; - dht_selfheal_directory (frame, dht_mkdir_selfheal_cbk, - &local->loc, layout); - } - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->subvolumes[i] == hashed_subvol) - continue; - STACK_WIND (frame, dht_mkdir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->mkdir, - &local->loc, local->mode); - } - return 0; + local->call_cnt = conf->subvolume_cnt - 1; + + if (uuid_is_null (local->loc.gfid)) + uuid_copy (local->loc.gfid, stbuf->ia_gfid); + if (local->call_cnt == 0) { + dht_selfheal_directory (frame, dht_mkdir_selfheal_cbk, + &local->loc, layout); + } + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == hashed_subvol) + continue; + STACK_WIND (frame, dht_mkdir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->mkdir, &local->loc, + local->mode, local->umask, local->params); + } + return 0; err: - DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + DHT_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL); return 0; } -int + + int dht_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode) + loc_t *loc, mode_t mode, mode_t umask, dict_t *params) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; int op_errno = -1; - int ret = -1; - xlator_t *hashed_subvol = NULL; + xlator_t *hashed_subvol = NULL; VALIDATE_OR_GOTO (frame, err); @@ -2982,129 +4236,234 @@ dht_mkdir (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); VALIDATE_OR_GOTO (loc->path, err); + VALIDATE_OR_GOTO (this->private, err); - conf = this->private; + conf = this->private; dht_get_du_info (frame, this, loc); - local = dht_local_init (frame); - if (!local) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - op_errno = ENOMEM; - goto err; - } - - hashed_subvol = dht_subvol_get_hashed (this, loc); + local = dht_local_init (frame, loc, NULL, GF_FOP_MKDIR); + if (!local) { + op_errno = ENOMEM; + goto err; + } - if (hashed_subvol == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "hashed subvol not found for %s", + hashed_subvol = dht_subvol_get_hashed (this, loc); + if (hashed_subvol == NULL) { + gf_log (this->name, GF_LOG_DEBUG, + "hashed subvol not found for %s", loc->path); - op_errno = EINVAL; - goto err; - } - - local->hashed_subvol = hashed_subvol; - local->inode = inode_ref (loc->inode); - ret = loc_copy (&local->loc, loc); - local->mode = mode; + op_errno = EINVAL; + goto err; + } - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - op_errno = ENOMEM; - goto err; - } + local->hashed_subvol = hashed_subvol; + local->mode = mode; + local->umask = umask; + local->params = dict_ref (params); + local->inode = inode_ref (loc->inode); - local->layout = dht_layout_new (this, conf->subvolume_cnt); - if (!local->layout) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - op_errno = ENOMEM; - goto err; - } + local->layout = dht_layout_new (this, conf->subvolume_cnt); + if (!local->layout) { + op_errno = ENOMEM; + goto err; + } - STACK_WIND (frame, dht_mkdir_hashed_cbk, - hashed_subvol, - hashed_subvol->fops->mkdir, - loc, mode); + STACK_WIND (frame, dht_mkdir_hashed_cbk, + hashed_subvol, + hashed_subvol->fops->mkdir, + loc, mode, umask, params); - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL); - return 0; + return 0; } int dht_rmdir_selfheal_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) + int op_ret, int op_errno, dict_t *xdata) { - dht_local_t *local = NULL; + dht_local_t *local = NULL; - local = frame->local; - local->layout = NULL; + local = frame->local; - DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno); + DHT_STACK_UNWIND (rmdir, frame, local->op_ret, local->op_errno, + &local->preparent, &local->postparent, NULL); - return 0; + return 0; +} + + +int +dht_rmdir_hashed_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + + local = frame->local; + prev = cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + local->op_ret = -1; + if (op_errno != ENOENT && op_errno != EACCES) { + local->need_selfheal = 1; + } + + + gf_log (this->name, GF_LOG_DEBUG, + "rmdir on %s for %s failed (%s)", + prev->this->name, local->loc.path, + strerror (op_errno)); + goto unlock; + } + + dht_iatt_merge (this, &local->preparent, preparent, prev->this); + dht_iatt_merge (this, &local->postparent, postparent, + prev->this); + + } +unlock: + UNLOCK (&frame->lock); + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + if (local->need_selfheal) { + local->layout = + dht_layout_get (this, local->loc.inode); + + /* TODO: neater interface needed below */ + local->stbuf.ia_type = local->loc.inode->ia_type; + + uuid_copy (local->gfid, local->loc.inode->gfid); + dht_selfheal_restore (frame, dht_rmdir_selfheal_cbk, + &local->loc, local->layout); + } else { + + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, + this, + &local->preparent, + 0); + + dht_inode_ctx_time_update (local->loc.parent, + this, + &local->postparent, + 1); + } + + DHT_STACK_UNWIND (rmdir, frame, local->op_ret, + local->op_errno, &local->preparent, + &local->postparent, NULL); + } + } + + return 0; } int dht_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) + int op_ret, int op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - uint64_t tmp_layout = 0; - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + int done = 0; - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - local->op_ret = -1; - - if (op_errno != ENOENT) - local->need_selfheal = 1; - - gf_log (this->name, GF_LOG_DEBUG, - "rmdir on %s for %s failed (%s)", - prev->this->name, local->loc.path, - strerror (op_errno)); - goto unlock; - } - } + local = frame->local; + prev = cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + local->op_ret = -1; + + if (op_errno != ENOENT && op_errno != EACCES) { + local->need_selfheal = 1; + } + + gf_log (this->name, GF_LOG_DEBUG, + "rmdir on %s for %s failed (%s)", + prev->this->name, local->loc.path, + strerror (op_errno)); + goto unlock; + } + + /* Track if rmdir succeeded on atleast one subvol*/ + local->fop_succeeded = 1; + dht_iatt_merge (this, &local->preparent, preparent, prev->this); + dht_iatt_merge (this, &local->postparent, postparent, + prev->this); + } unlock: - UNLOCK (&frame->lock); + UNLOCK (&frame->lock); - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - if (local->need_selfheal) { - inode_ctx_get (local->loc.inode, this, - &tmp_layout); - layout = (dht_layout_t *)(long)tmp_layout; - - /* TODO: neater interface needed below */ - local->stbuf.st_mode = local->loc.inode->st_mode; - - dht_selfheal_restore (frame, dht_rmdir_selfheal_cbk, - &local->loc, layout); - } else { - DHT_STACK_UNWIND (frame, local->op_ret, - local->op_errno); - } - } + this_call_cnt = dht_frame_return (frame); + + /* if local->hashed_subvol, we are yet to wind to hashed_subvol. */ + if (local->hashed_subvol && (this_call_cnt == 1)) { + done = 1; + } else if (!local->hashed_subvol && !this_call_cnt) { + done = 1; + } + + + if (done) { + if (local->need_selfheal && local->fop_succeeded) { + local->layout = + dht_layout_get (this, local->loc.inode); + + /* TODO: neater interface needed below */ + local->stbuf.ia_type = local->loc.inode->ia_type; + + uuid_copy (local->gfid, local->loc.inode->gfid); + dht_selfheal_restore (frame, dht_rmdir_selfheal_cbk, + &local->loc, local->layout); + } else if (this_call_cnt) { + /* If non-hashed subvol's have responded, proceed */ + + local->need_selfheal = 0; + STACK_WIND (frame, dht_rmdir_hashed_subvol_cbk, + local->hashed_subvol, + local->hashed_subvol->fops->rmdir, + &local->loc, local->flags, NULL); + } else if (!this_call_cnt) { + /* All subvol's have responded, proceed */ + + if (local->loc.parent) { + + dht_inode_ctx_time_update (local->loc.parent, + this, + &local->preparent, + 0); + + dht_inode_ctx_time_update (local->loc.parent, + this, + &local->postparent, + 1); + + } + + DHT_STACK_UNWIND (rmdir, frame, local->op_ret, + local->op_errno, &local->preparent, + &local->postparent, NULL); + } + } return 0; } @@ -3113,591 +4472,869 @@ unlock: int dht_rmdir_do (call_frame_t *frame, xlator_t *this) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int i = 0; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *hashed_subvol = NULL; - conf = this->private; - local = frame->local; + VALIDATE_OR_GOTO (this->private, err); - if (local->op_ret == -1) - goto err; + conf = this->private; + local = frame->local; - local->call_cnt = conf->subvolume_cnt; + if (local->op_ret == -1) + goto err; - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (frame, dht_rmdir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->rmdir, - &local->loc); - } + local->call_cnt = conf->subvolume_cnt; - return 0; + /* first remove from non-hashed_subvol */ + hashed_subvol = dht_subvol_get_hashed (this, &local->loc); + + if (!hashed_subvol) { + gf_log (this->name, GF_LOG_WARNING, "failed to get hashed " + "subvol for %s",local->loc.path); + } else { + local->hashed_subvol = hashed_subvol; + } + + /* When DHT has only 1 child */ + if (conf->subvolume_cnt == 1) { + STACK_WIND (frame, dht_rmdir_hashed_subvol_cbk, + conf->subvolumes[0], + conf->subvolumes[0]->fops->rmdir, + &local->loc, local->flags, NULL); + return 0; + } + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (hashed_subvol && + (hashed_subvol == conf->subvolumes[i])) + continue; + + STACK_WIND (frame, dht_rmdir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->rmdir, + &local->loc, local->flags, NULL); + } + + return 0; err: - DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno); - return 0; + DHT_STACK_UNWIND (rmdir, frame, local->op_ret, local->op_errno, + &local->preparent, &local->postparent, NULL); + return 0; } int -dht_rmdir_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *entries) +dht_rmdir_linkfile_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - dht_local_t *local = NULL; - int this_call_cnt = -1; - call_frame_t *prev = NULL; + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + xlator_t *src = NULL; + call_frame_t *main_frame = NULL; + dht_local_t *main_local = NULL; + int this_call_cnt = 0; + + local = frame->local; + prev = cookie; + src = prev->this; - local = frame->local; - prev = cookie; - - if (op_ret > 2) { - gf_log (this->name, GF_LOG_TRACE, - "readdir on %s for %s returned %d entries", - prev->this->name, local->loc.path, op_ret); - local->op_ret = -1; - local->op_errno = ENOTEMPTY; - } + main_frame = local->main_frame; + main_local = main_frame->local; - this_call_cnt = dht_frame_return (frame); + if (op_ret == 0) { + gf_log (this->name, GF_LOG_TRACE, + "unlinked linkfile %s on %s", + local->loc.path, src->name); + } else { + main_local->op_ret = -1; + main_local->op_errno = op_errno; + gf_log (this->name, GF_LOG_DEBUG, + "unlink of %s on %s failed (%s)", + local->loc.path, src->name, strerror (op_errno)); + } - if (is_last_call (this_call_cnt)) { - dht_rmdir_do (frame, this); - } + this_call_cnt = dht_frame_return (main_frame); + if (is_last_call (this_call_cnt)) + dht_rmdir_do (main_frame, this); - return 0; + DHT_STACK_DESTROY (frame); + return 0; } int -dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd) +dht_rmdir_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xattr, struct iatt *parent) { - dht_local_t *local = NULL; - int this_call_cnt = -1; - call_frame_t *prev = NULL; + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + xlator_t *src = NULL; + call_frame_t *main_frame = NULL; + dht_local_t *main_local = NULL; + int this_call_cnt = 0; + dht_conf_t *conf = this->private; + local = frame->local; + prev = cookie; + src = prev->this; - local = frame->local; - prev = cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "opendir on %s for %s failed (%s)", - prev->this->name, local->loc.path, - strerror (op_errno)); - goto err; - } + main_frame = local->main_frame; + main_local = main_frame->local; + + if (op_ret != 0) + goto err; - STACK_WIND (frame, dht_rmdir_readdir_cbk, - prev->this, prev->this->fops->readdir, - local->fd, 4096, 0); + if (!check_is_linkfile (inode, stbuf, xattr, conf->link_xattr_name)) { + main_local->op_ret = -1; + main_local->op_errno = ENOTEMPTY; - return 0; + gf_log (this->name, GF_LOG_WARNING, + "%s on %s found to be not a linkfile (type=0%o)", + local->loc.path, src->name, stbuf->ia_type); + goto err; + } + STACK_WIND (frame, dht_rmdir_linkfile_unlink_cbk, + src, src->fops->unlink, &local->loc, 0, NULL); + return 0; err: - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - dht_rmdir_do (frame, this); - } + this_call_cnt = dht_frame_return (main_frame); + if (is_last_call (this_call_cnt)) + dht_rmdir_do (main_frame, this); - return 0; + DHT_STACK_DESTROY (frame); + return 0; } int -dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc) +dht_rmdir_cached_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xattr, + struct iatt *parent) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int op_errno = -1; - int i = -1; - int ret = -1; - - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); + dht_local_t *local = NULL; + xlator_t *src = NULL; + call_frame_t *main_frame = NULL; + dht_local_t *main_local = NULL; + int this_call_cnt = 0; + dht_conf_t *conf = this->private; + dict_t *xattrs = NULL; + int ret = 0; - conf = this->private; + local = frame->local; + src = local->hashed_subvol; - local = dht_local_init (frame); - if (!local) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - op_errno = ENOMEM; - goto err; - } + main_frame = local->main_frame; + main_local = main_frame->local; - local->call_cnt = conf->subvolume_cnt; - local->op_ret = 0; + if (op_ret == 0) { + main_local->op_ret = -1; + main_local->op_errno = ENOTEMPTY; - ret = loc_copy (&local->loc, loc); - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - op_errno = ENOMEM; - goto err; - } + gf_log (this->name, GF_LOG_WARNING, + "%s found on cached subvol %s", + local->loc.path, src->name); + goto err; + } else if (op_errno != ENOENT) { + main_local->op_ret = -1; + main_local->op_errno = op_errno; + goto err; + } - local->fd = fd_create (local->loc.inode, frame->root->pid); - if (!local->fd) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - op_errno = ENOMEM; - goto err; - } + xattrs = dict_new (); + if (!xattrs) { + gf_log (this->name, GF_LOG_ERROR, "dict_new failed"); + goto err; + } - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (frame, dht_rmdir_opendir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->opendir, - loc, local->fd); - } + ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to set linkto key" + " in dict"); + if (xattrs) + dict_unref (xattrs); + goto err; + } - return 0; + STACK_WIND (frame, dht_rmdir_lookup_cbk, + src, src->fops->lookup, &local->loc, xattrs); + if (xattrs) + dict_unref (xattrs); + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno); - - return 0; -} + this_call_cnt = dht_frame_return (main_frame); + if (is_last_call (this_call_cnt)) + dht_rmdir_do (main_frame, this); -int -dht_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict) -{ - DHT_STACK_UNWIND (frame, op_ret, op_errno, dict); - return 0; + DHT_STACK_DESTROY (frame); + return 0; } int -dht_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t flags, dict_t *dict) +dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this, + gf_dirent_t *entries, xlator_t *src) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; + int ret = 0; + int build_ret = 0; + gf_dirent_t *trav = NULL; + call_frame_t *lookup_frame = NULL; + dht_local_t *lookup_local = NULL; + dht_local_t *local = NULL; + dict_t *xattrs = NULL; + dht_conf_t *conf = this->private; + xlator_t *subvol = NULL; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); + local = frame->local; - subvol = dht_subvol_get_cached (this, loc->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + list_for_each_entry (trav, &entries->list, list) { + if (strcmp (trav->d_name, ".") == 0) + continue; + if (strcmp (trav->d_name, "..") == 0) + continue; + if (check_is_linkfile (NULL, (&trav->d_stat), trav->dict, + conf->link_xattr_name)) { + ret++; + continue; + } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + /* this entry is either a directory which is neither "." nor "..", + or a non directory which is not a linkfile. the directory is to + be treated as non-empty + */ + return 0; + } - local->inode = inode_ref (loc->inode); - local->call_cnt = 1; + xattrs = dict_new (); + if (!xattrs) { + gf_log (this->name, GF_LOG_ERROR, "dict_new failed"); + return -1; + } - STACK_WIND (frame, - dht_xattrop_cbk, - subvol, subvol->fops->xattrop, - loc, flags, dict); + ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to set linkto key" + " in dict"); + if (xattrs) + dict_unref (xattrs); + return -1; + } - return 0; + list_for_each_entry (trav, &entries->list, list) { + if (strcmp (trav->d_name, ".") == 0) + continue; + if (strcmp (trav->d_name, "..") == 0) + continue; -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + lookup_frame = NULL; + lookup_local = NULL; - return 0; -} + lookup_frame = copy_frame (frame); + if (!lookup_frame) { + /* out of memory, let the rmdir fail + (as non-empty, unfortunately) */ + goto err; + } + lookup_local = mem_get0 (this->local_pool); + if (!lookup_local) { + goto err; + } -int -dht_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict) -{ - DHT_STACK_UNWIND (frame, op_ret, op_errno, dict); - return 0; -} + lookup_frame->local = lookup_local; + lookup_local->main_frame = frame; + lookup_local->hashed_subvol = src; + build_ret = dht_build_child_loc (this, &lookup_local->loc, + &local->loc, trav->d_name); + if (build_ret != 0) + goto err; -int -dht_fxattrop (call_frame_t *frame, xlator_t *this, - fd_t *fd, gf_xattrop_flags_t flags, dict_t *dict) -{ - xlator_t *subvol = NULL; - int op_errno = -1; + uuid_copy (lookup_local->loc.gfid, trav->d_stat.ia_gfid); - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + gf_log (this->name, GF_LOG_TRACE, + "looking up %s on %s", + lookup_local->loc.path, src->name); - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + LOCK (&frame->lock); + { + local->call_cnt++; + } + UNLOCK (&frame->lock); - STACK_WIND (frame, - dht_fxattrop_cbk, - subvol, subvol->fops->fxattrop, - fd, flags, dict); + subvol = dht_linkfile_subvol (this, NULL, &trav->d_stat, + trav->dict); + if (!subvol) { + gf_log (this->name, GF_LOG_INFO, + "linkfile not having link subvolume. path=%s", + lookup_local->loc.path); + STACK_WIND (lookup_frame, dht_rmdir_lookup_cbk, + src, src->fops->lookup, + &lookup_local->loc, xattrs); + } else { + STACK_WIND (lookup_frame, dht_rmdir_cached_lookup_cbk, + subvol, subvol->fops->lookup, + &lookup_local->loc, xattrs); + } + ret++; + } - return 0; + if (xattrs) + dict_unref (xattrs); + return ret; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + if (xattrs) + dict_unref (xattrs); - return 0; + DHT_STACK_DESTROY (lookup_frame); + return 0; } int -dht_inodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) - +dht_rmdir_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *entries, + dict_t *xdata) { - DHT_STACK_UNWIND (frame, op_ret, op_errno); - return 0; + dht_local_t *local = NULL; + int this_call_cnt = -1; + call_frame_t *prev = NULL; + xlator_t *src = NULL; + int ret = 0; + + local = frame->local; + prev = cookie; + src = prev->this; + + if (op_ret > 2) { + ret = dht_rmdir_is_subvol_empty (frame, this, entries, src); + + switch (ret) { + case 0: /* non linkfiles exist */ + gf_log (this->name, GF_LOG_TRACE, + "readdir on %s for %s returned %d entries", + prev->this->name, local->loc.path, op_ret); + local->op_ret = -1; + local->op_errno = ENOTEMPTY; + break; + default: + /* @ret number of linkfiles are getting unlinked */ + gf_log (this->name, GF_LOG_TRACE, + "readdir on %s for %s found %d linkfiles", + prev->this->name, local->loc.path, ret); + break; + } + } + + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + dht_rmdir_do (frame, this); + } + + return 0; } -int32_t -dht_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct flock *lock) +int +dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - + dht_local_t *local = NULL; + int this_call_cnt = -1; + call_frame_t *prev = NULL; + dict_t *dict = NULL; + int ret = 0; + dht_conf_t *conf = this->private; + int i = 0; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); + local = frame->local; + prev = cookie; - subvol = dht_subvol_get_cached (this, loc->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + this_call_cnt = dht_frame_return (frame); + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "opendir on %s for %s failed (%s)", + prev->this->name, local->loc.path, + strerror (op_errno)); + if (op_errno != ENOENT) { + local->op_ret = -1; + local->op_errno = op_errno; + } + goto err; + } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + if (!is_last_call (this_call_cnt)) + return 0; - local->inode = inode_ref (loc->inode); - local->call_cnt = 1; + if (local->op_ret == -1) + goto err; - STACK_WIND (frame, - dht_inodelk_cbk, - subvol, subvol->fops->inodelk, - volume, loc, cmd, lock); + dict = dict_new (); + if (!dict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto err; + } - return 0; + ret = dict_set_uint32 (dict, conf->link_xattr_name, 256); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set '%s' key", + local->loc.path, conf->link_xattr_name); -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno); + local->call_cnt = conf->subvolume_cnt; + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_rmdir_readdirp_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->readdirp, + local->fd, 4096, 0, dict); + } - return 0; -} + if (dict) + dict_unref (dict); + return 0; -int -dht_finodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) +err: + if (is_last_call (this_call_cnt)) { + dht_rmdir_do (frame, this); + } -{ - DHT_STACK_UNWIND (frame, op_ret, op_errno); - return 0; + return 0; } int -dht_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct flock *lock) +dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) { - xlator_t *subvol = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; int op_errno = -1; + int i = -1; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + VALIDATE_OR_GOTO (this->private, err); - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + conf = this->private; + local = dht_local_init (frame, loc, NULL, GF_FOP_RMDIR); + if (!local) { + op_errno = ENOMEM; + goto err; + } - STACK_WIND (frame, - dht_finodelk_cbk, - subvol, subvol->fops->finodelk, - volume, fd, cmd, lock); + local->call_cnt = conf->subvolume_cnt; + local->op_ret = 0; + local->fop_succeeded = 0; - return 0; + local->flags = flags; + + local->fd = fd_create (local->loc.inode, frame->root->pid); + if (!local->fd) { + + op_errno = ENOMEM; + goto err; + } + + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_rmdir_opendir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->opendir, + loc, local->fd, NULL); + } + + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (rmdir, frame, -1, op_errno, + NULL, NULL, NULL); - return 0; + return 0; } - int dht_entrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - DHT_STACK_UNWIND (frame, op_ret, op_errno); - return 0; + DHT_STACK_UNWIND (entrylk, frame, op_ret, op_errno, xdata); + return 0; } int dht_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type) + const char *volume, loc_t *loc, const char *basename, + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { - xlator_t *subvol = NULL; + xlator_t *subvol = NULL; int op_errno = -1; - dht_local_t *local = NULL; + dht_local_t *local = NULL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - subvol = dht_subvol_get_cached (this, loc->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + local = dht_local_init (frame, loc, NULL, GF_FOP_ENTRYLK); + if (!local) { + op_errno = ENOMEM; + goto err; + } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } - local->inode = inode_ref (loc->inode); - local->call_cnt = 1; + local->call_cnt = 1; - STACK_WIND (frame, dht_entrylk_cbk, - subvol, subvol->fops->entrylk, - volume, loc, basename, cmd, type); + STACK_WIND (frame, dht_entrylk_cbk, + subvol, subvol->fops->entrylk, + volume, loc, basename, cmd, type, xdata); - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL); - return 0; + return 0; } int dht_fentrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - DHT_STACK_UNWIND (frame, op_ret, op_errno); - return 0; + DHT_STACK_UNWIND (fentrylk, frame, op_ret, op_errno, NULL); + return 0; } int dht_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type) + const char *volume, fd_t *fd, const char *basename, + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { - xlator_t *subvol = NULL; + xlator_t *subvol = NULL; int op_errno = -1; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } - STACK_WIND (frame, dht_fentrylk_cbk, - subvol, subvol->fops->fentrylk, - volume, fd, basename, cmd, type); + STACK_WIND (frame, dht_fentrylk_cbk, + subvol, subvol->fops->fentrylk, + volume, fd, basename, cmd, type, xdata); - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL); - return 0; + return 0; } int dht_forget (xlator_t *this, inode_t *inode) { - uint64_t tmp_layout = 0; - dht_layout_t *layout = NULL; + uint64_t ctx_int = 0; + dht_inode_ctx_t *ctx = NULL; + dht_layout_t *layout = NULL; - inode_ctx_get (inode, this, &tmp_layout); + inode_ctx_del (inode, this, &ctx_int); - if (!tmp_layout) - return 0; + if (!ctx_int) + return 0; - layout = (dht_layout_t *)(long)tmp_layout; - if (!layout->preset) - FREE (layout); + ctx = (dht_inode_ctx_t *) (long) ctx_int; - return 0; -} + layout = ctx->layout; + ctx->layout = NULL; + dht_layout_unref (this, layout); + GF_FREE (ctx); + return 0; +} int -dht_init_subvolumes (xlator_t *this, dht_conf_t *conf) +dht_notify (xlator_t *this, int event, void *data, ...) { - xlator_list_t *subvols = NULL; - int cnt = 0; + xlator_t *subvol = NULL; + int cnt = -1; + int i = -1; + dht_conf_t *conf = NULL; + int ret = -1; + int propagate = 0; + + int had_heard_from_all = 0; + int have_heard_from_all = 0; + struct timeval time = {0,}; + gf_defrag_info_t *defrag = NULL; + dict_t *dict = NULL; + gf_defrag_type cmd = 0; + dict_t *output = NULL; + va_list ap; - for (subvols = this->children; subvols; subvols = subvols->next) - cnt++; - - conf->subvolumes = CALLOC (cnt, sizeof (xlator_t *)); - if (!conf->subvolumes) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - return -1; + conf = this->private; + if (!conf) + return ret; + + /* had all subvolumes reported status once till now? */ + had_heard_from_all = 1; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!conf->last_event[i]) { + had_heard_from_all = 0; + } } - conf->subvolume_cnt = cnt; - cnt = 0; - for (subvols = this->children; subvols; subvols = subvols->next) - conf->subvolumes[cnt++] = subvols->xlator; + switch (event) { + case GF_EVENT_CHILD_UP: + subvol = data; - conf->subvolume_status = CALLOC (cnt, sizeof (char)); - if (!conf->subvolume_status) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - return -1; - } + conf->gen++; - return 0; -} + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + cnt = i; + break; + } + } + if (cnt == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "got GF_EVENT_CHILD_UP bad subvolume %s", + subvol->name); + break; + } -int -dht_notify (xlator_t *this, int event, void *data, ...) -{ - xlator_t *subvol = NULL; - int cnt = -1; - int i = -1; - dht_conf_t *conf = NULL; - int ret = -1; + gettimeofday (&time, NULL); + LOCK (&conf->subvolume_lock); + { + conf->subvolume_status[cnt] = 1; + conf->last_event[cnt] = event; + conf->subvol_up_time[cnt] = time.tv_sec; + } + UNLOCK (&conf->subvolume_lock); + /* one of the node came back up, do a stat update */ + dht_get_du_info_for_subvol (this, cnt); - conf = this->private; + break; - switch (event) { - case GF_EVENT_CHILD_UP: - subvol = data; + case GF_EVENT_CHILD_MODIFIED: + subvol = data; - conf->gen++; + conf->gen++; + propagate = 1; - for (i = 0; i < conf->subvolume_cnt; i++) { - if (subvol == conf->subvolumes[i]) { - cnt = i; - break; - } - } + break; - if (cnt == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "got GF_EVENT_CHILD_UP bad subvolume %s", - subvol->name); - break; - } + case GF_EVENT_CHILD_DOWN: + subvol = data; - LOCK (&conf->subvolume_lock); - { - conf->subvolume_status[cnt] = 1; - } - UNLOCK (&conf->subvolume_lock); + if (conf->assert_no_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "Received CHILD_DOWN. Exiting"); + if (conf->defrag) { + gf_defrag_stop (conf->defrag, + GF_DEFRAG_STATUS_FAILED, NULL); + } else { + kill (getpid(), SIGTERM); + } + } - /* one of the node came back up, do a stat update */ - dht_get_du_info_for_subvol (this, cnt); + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + cnt = i; + break; + } + } - break; + if (cnt == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "got GF_EVENT_CHILD_DOWN bad subvolume %s", + subvol->name); + break; + } - case GF_EVENT_CHILD_DOWN: - subvol = data; + LOCK (&conf->subvolume_lock); + { + conf->subvolume_status[cnt] = 0; + conf->last_event[cnt] = event; + conf->subvol_up_time[cnt] = 0; + } + UNLOCK (&conf->subvolume_lock); - for (i = 0; i < conf->subvolume_cnt; i++) { - if (subvol == conf->subvolumes[i]) { - cnt = i; - break; - } - } + break; - if (cnt == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "got GF_EVENT_CHILD_DOWN bad subvolume %s", - subvol->name); - break; - } + case GF_EVENT_CHILD_CONNECTING: + subvol = data; - LOCK (&conf->subvolume_lock); - { - conf->subvolume_status[cnt] = 0; - } - UNLOCK (&conf->subvolume_lock); + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + cnt = i; + break; + } + } - break; - } + if (cnt == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "got GF_EVENT_CHILD_CONNECTING bad subvolume %s", + subvol->name); + break; + } - ret = default_notify (this, event, data); + LOCK (&conf->subvolume_lock); + { + conf->last_event[cnt] = event; + } + UNLOCK (&conf->subvolume_lock); - return ret; + break; + case GF_EVENT_VOLUME_DEFRAG: + { + if (!conf->defrag) { + return ret; + } + defrag = conf->defrag; + + dict = data; + va_start (ap, data); + output = va_arg (ap, dict_t*); + + ret = dict_get_int32 (dict, "rebalance-command", + (int32_t*)&cmd); + if (ret) + return ret; + LOCK (&defrag->lock); + { + if (defrag->is_exiting) + goto unlock; + if (cmd == GF_DEFRAG_CMD_STATUS) + gf_defrag_status_get (defrag, output); + else if (cmd == GF_DEFRAG_CMD_STOP) + gf_defrag_stop (defrag, + GF_DEFRAG_STATUS_STOPPED, output); + } +unlock: + UNLOCK (&defrag->lock); + return 0; + break; + } + + default: + propagate = 1; + break; + } + + + /* have all subvolumes reported status once by now? */ + have_heard_from_all = 1; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!conf->last_event[i]) + have_heard_from_all = 0; + } + + /* if all subvols have reported status, no need to hide anything + or wait for anything else. Just propagate blindly */ + if (have_heard_from_all) { + propagate = 1; + + } + + + if (!had_heard_from_all && have_heard_from_all) { + /* This is the first event which completes aggregation + of events from all subvolumes. If at least one subvol + had come up, propagate CHILD_UP, but only this time + */ + event = GF_EVENT_CHILD_DOWN; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->last_event[i] == GF_EVENT_CHILD_UP) { + event = GF_EVENT_CHILD_UP; + break; + } + + if (conf->last_event[i] == GF_EVENT_CHILD_CONNECTING) { + event = GF_EVENT_CHILD_CONNECTING; + /* continue to check other events for CHILD_UP */ + } + } + + /* rebalance is started with assert_no_child_down. So we do + * not need to handle CHILD_DOWN event here. + */ + if (conf->defrag) { + ret = gf_thread_create (&conf->defrag->th, NULL, + gf_defrag_start, this); + if (ret) { + conf->defrag = NULL; + GF_FREE (conf->defrag); + kill (getpid(), SIGTERM); + } + } + } + + ret = 0; + if (propagate) + ret = default_notify (this, event, data); + + return ret; } +int +dht_inode_ctx_layout_get (inode_t *inode, xlator_t *this, dht_layout_t **layout) +{ + dht_inode_ctx_t *ctx = NULL; + int ret = -1; + + ret = dht_inode_ctx_get (inode, this, &ctx); + + if (!ret && ctx) { + if (ctx->layout) { + if (layout) + *layout = ctx->layout; + ret = 0; + } else { + ret = -1; + } + } + + return ret; +} diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index f5c95b4cb..2ece28a61 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -22,223 +13,775 @@ #include "config.h" #endif +#include <regex.h> +#include <signal.h> + +#include "dht-mem-types.h" +#include "libxlator.h" +#include "syncop.h" + #ifndef _DHT_H #define _DHT_H +#define GF_XATTR_FIX_LAYOUT_KEY "distribute.fix.layout" +#define GF_DHT_LOOKUP_UNHASHED_ON 1 +#define GF_DHT_LOOKUP_UNHASHED_AUTO 2 +#define DHT_PATHINFO_HEADER "DISTRIBUTE:" + +#include <fnmatch.h> typedef int (*dht_selfheal_dir_cbk_t) (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno); + xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t *xdata); +typedef int (*dht_defrag_cbk_fn_t) (xlator_t *this, call_frame_t *frame, + int ret); struct dht_layout { - int cnt; - int preset; - int gen; - int type; + int spread_cnt; /* layout spread count per directory, + is controlled by 'setxattr()' with + special key */ + int cnt; + int preset; + int gen; + int type; + int ref; /* use with dht_conf_t->layout_lock */ + gf_boolean_t search_unhashed; struct { - int err; /* 0 = normal - -1 = dir exists and no xattr - >0 = dir lookup failed with errno - */ - uint32_t start; - uint32_t stop; - xlator_t *xlator; - } list[0]; + int err; /* 0 = normal + -1 = dir exists and no xattr + >0 = dir lookup failed with errno + */ + uint32_t start; + uint32_t stop; + xlator_t *xlator; + } list[]; }; -typedef struct dht_layout dht_layout_t; +typedef struct dht_layout dht_layout_t; + +struct dht_stat_time { + uint32_t atime; + uint32_t atime_nsec; + uint32_t ctime; + uint32_t ctime_nsec; + uint32_t mtime; + uint32_t mtime_nsec; +}; + +typedef struct dht_stat_time dht_stat_time_t; +struct dht_inode_ctx { + dht_layout_t *layout; + dht_stat_time_t time; +}; + +typedef struct dht_inode_ctx dht_inode_ctx_t; + + +typedef enum { + DHT_HASH_TYPE_DM, + DHT_HASH_TYPE_DM_USER, +} dht_hashfn_type_t; + +/* rebalance related */ +struct dht_rebalance_ { + xlator_t *from_subvol; + xlator_t *target_node; + off_t offset; + size_t size; + int32_t flags; + int count; + struct iobref *iobref; + struct iovec *vector; + struct iatt stbuf; + dht_defrag_cbk_fn_t target_op_fn; + dict_t *xdata; +}; struct dht_local { - int call_cnt; - loc_t loc; - loc_t loc2; - int op_ret; - int op_errno; - int layout_mismatch; - struct stat stbuf; - struct statvfs statvfs; - fd_t *fd; - inode_t *inode; - dict_t *xattr; - dict_t *xattr_req; - dht_layout_t *layout; - size_t size; - ino_t st_ino; - xlator_t *src_hashed, *src_cached; - xlator_t *dst_hashed, *dst_cached; - xlator_t *cached_subvol; - xlator_t *hashed_subvol; - char need_selfheal; + int call_cnt; + loc_t loc; + loc_t loc2; + int op_ret; + int op_errno; + int layout_mismatch; + /* Use stbuf as the postbuf, when we require both + * pre and post attrs */ + struct iatt stbuf; + struct iatt prebuf; + struct iatt preoldparent; + struct iatt postoldparent; + struct iatt preparent; + struct iatt postparent; + struct statvfs statvfs; + fd_t *fd; + inode_t *inode; + dict_t *params; + dict_t *xattr; + dict_t *xattr_req; + dht_layout_t *layout; + size_t size; + ino_t ia_ino; + xlator_t *src_hashed, *src_cached; + xlator_t *dst_hashed, *dst_cached; + xlator_t *cached_subvol; + xlator_t *hashed_subvol; + char need_selfheal; int file_count; int dir_count; - struct { - fop_mknod_cbk_t linkfile_cbk; - struct stat stbuf; - loc_t loc; - inode_t *inode; - dict_t *xattr; - xlator_t *srcvol; - } linkfile; - struct { - uint32_t hole_cnt; - uint32_t overlaps_cnt; - uint32_t missing; - uint32_t down; - uint32_t misc; - dht_selfheal_dir_cbk_t dir_cbk; - dht_layout_t *layout; - } selfheal; - - /* needed by nufa */ - int32_t flags; - mode_t mode; - dev_t rdev; + call_frame_t *main_frame; + int fop_succeeded; + struct { + fop_mknod_cbk_t linkfile_cbk; + struct iatt stbuf; + loc_t loc; + inode_t *inode; + dict_t *xattr; + xlator_t *srcvol; + } linkfile; + struct { + uint32_t hole_cnt; + uint32_t overlaps_cnt; + uint32_t down; + uint32_t misc; + dht_selfheal_dir_cbk_t dir_cbk; + dht_layout_t *layout; + } selfheal; + uint32_t uid; + uint32_t gid; + + /* needed by nufa */ + int32_t flags; + mode_t mode; + dev_t rdev; + mode_t umask; + + /* need for file-info */ + char *xattr_val; + char *key; + + /* which xattr request? */ + char xsel[256]; + int32_t alloc_len; + + char *newpath; + + /* gfid related */ + uuid_t gfid; + + /*Marker Related*/ + struct marker_str marker; + + /* flag used to make sure we need to return estale in + {lookup,revalidate}_cbk */ + char return_estale; + char need_lookup_everywhere; + + glusterfs_fop_t fop; + + gf_boolean_t linked; + xlator_t *link_subvol; + + struct dht_rebalance_ rebalance; + xlator_t *first_up_subvol; + }; typedef struct dht_local dht_local_t; /* du - disk-usage */ struct dht_du { double avail_percent; + double avail_inodes; uint64_t avail_space; uint32_t log; }; typedef struct dht_du dht_du_t; +enum gf_defrag_type { + GF_DEFRAG_CMD_START = 1, + GF_DEFRAG_CMD_STOP = 1 + 1, + GF_DEFRAG_CMD_STATUS = 1 + 2, + GF_DEFRAG_CMD_START_LAYOUT_FIX = 1 + 3, + GF_DEFRAG_CMD_START_FORCE = 1 + 4, +}; +typedef enum gf_defrag_type gf_defrag_type; + +enum gf_defrag_status_t { + GF_DEFRAG_STATUS_NOT_STARTED, + GF_DEFRAG_STATUS_STARTED, + GF_DEFRAG_STATUS_STOPPED, + GF_DEFRAG_STATUS_COMPLETE, + GF_DEFRAG_STATUS_FAILED, + GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED, + GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED, + GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE, + GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED, +}; +typedef enum gf_defrag_status_t gf_defrag_status_t; + +typedef struct gf_defrag_pattern_list gf_defrag_pattern_list_t; + +struct gf_defrag_pattern_list { + char path_pattern[256]; + uint64_t size; + gf_defrag_pattern_list_t *next; +}; + +struct gf_defrag_info_ { + uint64_t total_files; + uint64_t total_data; + uint64_t num_files_lookedup; + uint64_t total_failures; + uint64_t skipped; + gf_lock_t lock; + int cmd; + pthread_t th; + gf_defrag_status_t defrag_status; + struct rpc_clnt *rpc; + uint32_t connected; + uint32_t is_exiting; + pid_t pid; + inode_t *root_inode; + uuid_t node_uuid; + struct timeval start_time; + gf_boolean_t stats; + gf_defrag_pattern_list_t *defrag_pattern; +}; + +typedef struct gf_defrag_info_ gf_defrag_info_t; + struct dht_conf { - gf_lock_t subvolume_lock; + gf_lock_t subvolume_lock; int subvolume_cnt; xlator_t **subvolumes; - xlator_t *local_volume; /* Needed by NUFA */ - char *subvolume_status; - dht_layout_t **file_layouts; - dht_layout_t **dir_layouts; - dht_layout_t *default_dir_layout; - gf_boolean_t search_unhashed; - int gen; + char *subvolume_status; + int *last_event; + dht_layout_t **file_layouts; + dht_layout_t **dir_layouts; + gf_boolean_t search_unhashed; + int gen; dht_du_t *du_stats; - uint64_t min_free_disk; + double min_free_disk; + double min_free_inodes; char disk_unit; int32_t refresh_interval; gf_boolean_t unhashed_sticky_bit; - struct timeval last_stat_fetch; + struct timeval last_stat_fetch; + gf_lock_t layout_lock; + void *private; /* Can be used by wrapper xlators over + dht */ + gf_boolean_t use_readdirp; + char vol_uuid[UUID_SIZE + 1]; + gf_boolean_t assert_no_child_down; + time_t *subvol_up_time; + + /* This is the count used as the distribute layout for a directory */ + /* Will be a global flag to control the layout spread count */ + uint32_t dir_spread_cnt; + + /* to keep track of nodes which are decomissioned */ + xlator_t **decommissioned_bricks; + int decommission_in_progress; + int decommission_subvols_cnt; + + /* defrag related */ + gf_defrag_info_t *defrag; + + /* Request to filter directory entries in readdir request */ + + gf_boolean_t readdir_optimize; + + /* Support regex-based name reinterpretation. */ + regex_t rsync_regex; + gf_boolean_t rsync_regex_valid; + regex_t extra_regex; + gf_boolean_t extra_regex_valid; + + /* Support variable xattr names. */ + char *xattr_name; + char *link_xattr_name; + char *wild_xattr_name; }; typedef struct dht_conf dht_conf_t; struct dht_disk_layout { - uint32_t cnt; - uint32_t type; - struct { - uint32_t start; - uint32_t stop; - } list[1]; + uint32_t cnt; + uint32_t type; + struct { + uint32_t start; + uint32_t stop; + } list[1]; }; typedef struct dht_disk_layout dht_disk_layout_t; - -#define ENTRY_MISSING(op_ret, op_errno) (op_ret == -1 && op_errno == ENOENT) -#define is_fs_root(loc) (strcmp (loc->path, "/") == 0) +typedef enum { + GF_DHT_MIGRATE_DATA, + GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS, + GF_DHT_MIGRATE_HARDLINK, + GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS +} gf_dht_migrate_data_type_t; + +#define ENTRY_MISSING(op_ret, op_errno) (op_ret == -1 && op_errno == ENOENT) -#define is_revalidate(loc) (inode_ctx_get (loc->inode, this, NULL) == 0) +#define is_revalidate(loc) (dht_inode_ctx_layout_get (loc->inode, this, NULL) == 0) #define is_last_call(cnt) (cnt == 0) -#define DHT_LINKFILE_MODE (S_ISVTX) -#define check_is_linkfile(i,s,x) ((s->st_mode & ~S_IFMT) == DHT_LINKFILE_MODE) +#define DHT_MIGRATION_IN_PROGRESS 1 +#define DHT_MIGRATION_COMPLETED 2 + +#define check_is_linkfile(i,s,x,n) (IS_DHT_LINKFILE_MODE (s) && dict_get (x, n)) + +#define IS_DHT_MIGRATION_PHASE2(buf) ( \ + IA_ISREG ((buf)->ia_type) && \ + ((st_mode_from_ia ((buf)->ia_prot, (buf)->ia_type) & \ + ~S_IFMT) == DHT_LINKFILE_MODE)) + +#define IS_DHT_MIGRATION_PHASE1(buf) ( \ + IA_ISREG ((buf)->ia_type) && \ + ((buf)->ia_prot.sticky == 1) && \ + ((buf)->ia_prot.sgid == 1)) -#define check_is_dir(i,s,x) (S_ISDIR(s->st_mode)) +#define DHT_STRIP_PHASE1_FLAGS(buf) do { \ + if ((buf) && IS_DHT_MIGRATION_PHASE1(buf)) { \ + (buf)->ia_prot.sticky = 0; \ + (buf)->ia_prot.sgid = 0; \ + } \ + } while (0) + +#define dht_inode_missing(op_errno) (op_errno == ENOENT || op_errno == ESTALE) + +#define check_is_dir(i,s,x) (IA_ISDIR(s->ia_type)) #define layout_is_sane(layout) ((layout) && (layout->cnt > 0)) -#define DHT_STACK_UNWIND(frame, params ...) do { \ - dht_local_t *__local = NULL; \ - __local = frame->local; \ - frame->local = NULL; \ - STACK_UNWIND (frame, params); \ - dht_local_wipe (__local); \ - } while (0) - -#define DHT_STACK_DESTROY(frame) do { \ - dht_local_t *__local = NULL; \ - __local = frame->local; \ - frame->local = NULL; \ - STACK_DESTROY (frame->root); \ - dht_local_wipe (__local); \ - } while (0) - -dht_layout_t *dht_layout_new (xlator_t *this, int cnt); -dht_layout_t *dht_layout_get (xlator_t *this, inode_t *inode); -dht_layout_t *dht_layout_for_subvol (xlator_t *this, xlator_t *subvol); -xlator_t *dht_layout_search (xlator_t *this, dht_layout_t *layout, - const char *name); -int dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout); -int dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, - uint32_t *holes_p, uint32_t *overlaps_p, - uint32_t *missing_p, uint32_t *down_p, - uint32_t *misc_p); -int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, - xlator_t *subvol, loc_t *loc, dict_t *xattr); +#define DHT_STACK_UNWIND(fop, frame, params ...) do { \ + dht_local_t *__local = NULL; \ + xlator_t *__xl = NULL; \ + if (frame) { \ + __xl = frame->this; \ + __local = frame->local; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT (fop, frame, params); \ + dht_local_wipe (__xl, __local); \ + } while (0) + +#define DHT_STACK_DESTROY(frame) do { \ + dht_local_t *__local = NULL; \ + xlator_t *__xl = NULL; \ + __xl = frame->this; \ + __local = frame->local; \ + frame->local = NULL; \ + STACK_DESTROY (frame->root); \ + dht_local_wipe (__xl, __local); \ + } while (0) + +#define DHT_UPDATE_TIME(ctx_sec, ctx_nsec, new_sec, new_nsec, inode, post) do {\ + int32_t sec = 0; \ + sec = new_sec; \ + LOCK (&inode->lock); \ + { \ + new_sec = max(new_sec, ctx_sec); \ + if (sec < new_sec) \ + new_nsec = ctx_nsec; \ + if (sec == new_sec) \ + new_nsec = max (new_nsec, ctx_nsec); \ + if (post) { \ + ctx_sec = new_sec; \ + ctx_nsec = new_nsec; \ + } \ + } \ + UNLOCK (&inode->lock); \ + } while (0) + +#define is_greater_time(a, an, b, bn) (((a) < (b)) || (((a) == (b)) && ((an) < (bn)))) +dht_layout_t *dht_layout_new (xlator_t *this, int cnt); +dht_layout_t *dht_layout_get (xlator_t *this, inode_t *inode); +dht_layout_t *dht_layout_for_subvol (xlator_t *this, xlator_t *subvol); +xlator_t *dht_layout_search (xlator_t *this, dht_layout_t *layout, + const char *name); +int dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout); +int dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, + uint32_t *holes_p, uint32_t *overlaps_p, + uint32_t *missing_p, uint32_t *down_p, + uint32_t *misc_p, uint32_t *no_space_p); +int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, + xlator_t *subvol, loc_t *loc, dict_t *xattr); xlator_t *dht_linkfile_subvol (xlator_t *this, inode_t *inode, - struct stat *buf, dict_t *xattr); -int dht_linkfile_unlink (call_frame_t *frame, xlator_t *this, - xlator_t *subvol, loc_t *loc); + struct iatt *buf, dict_t *xattr); +int dht_linkfile_unlink (call_frame_t *frame, xlator_t *this, + xlator_t *subvol, loc_t *loc); int dht_layouts_init (xlator_t *this, dht_conf_t *conf); int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, - int op_ret, int op_errno, dict_t *xattr); + int op_ret, int op_errno, dict_t *xattr); int dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout, - int pos, int32_t **disk_layout_p); -int dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, - int pos, int32_t *disk_layout); + int pos, int32_t **disk_layout_p); +int dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, + int pos, void *disk_layout_raw, int disk_layout_len); int dht_frame_return (call_frame_t *frame); -int dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y); +int dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y); int dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol, - uint64_t *x); + uint64_t *x); -void dht_local_wipe (dht_local_t *local); -dht_local_t *dht_local_init (call_frame_t *frame); -int dht_stat_merge (xlator_t *this, struct stat *to, struct stat *from, - xlator_t *subvol); +void dht_local_wipe (xlator_t *this, dht_local_t *local); +dht_local_t *dht_local_init (call_frame_t *frame, loc_t *loc, fd_t *fd, + glusterfs_fop_t fop); +int dht_iatt_merge (xlator_t *this, struct iatt *to, struct iatt *from, + xlator_t *subvol); xlator_t *dht_subvol_get_hashed (xlator_t *this, loc_t *loc); xlator_t *dht_subvol_get_cached (xlator_t *this, inode_t *inode); xlator_t *dht_subvol_next (xlator_t *this, xlator_t *prev); -int dht_subvol_cnt (xlator_t *this, xlator_t *subvol); +xlator_t *dht_subvol_next_available (xlator_t *this, xlator_t *prev); +int dht_subvol_cnt (xlator_t *this, xlator_t *subvol); -int dht_hash_compute (int type, const char *name, uint32_t *hash_p); +int dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p); -int dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, - xlator_t *tovol, xlator_t *fromvol, loc_t *loc); -int dht_lookup_directory (call_frame_t *frame, xlator_t *this, loc_t *loc); -int dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc); +int dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, + xlator_t *this, xlator_t *tovol, + xlator_t *fromvol, loc_t *loc); +int dht_lookup_directory (call_frame_t *frame, xlator_t *this, loc_t *loc); +int dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc); int -dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, - loc_t *loc, dht_layout_t *layout); +dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, + loc_t *loc, dht_layout_t *layout); int dht_selfheal_new_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, - dht_layout_t *layout); + dht_layout_t *layout); int -dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, - loc_t *loc, dht_layout_t *layout); +dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, + loc_t *loc, dht_layout_t *layout); int dht_layout_sort_volname (dht_layout_t *layout); -int dht_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc); - int dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc); -int dht_is_subvol_filled (xlator_t *this, xlator_t *subvol); -xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol); -int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx); +gf_boolean_t dht_is_subvol_filled (xlator_t *this, xlator_t *subvol); +xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol, + dht_local_t *layout); +int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx); + +int dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode); +int dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout);; +void dht_layout_unref (xlator_t *this, dht_layout_t *layout); +dht_layout_t *dht_layout_ref (xlator_t *this, dht_layout_t *layout); +xlator_t *dht_first_up_subvol (xlator_t *this); +xlator_t *dht_last_up_subvol (xlator_t *this); + +int dht_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name); + +int dht_filter_loc_subvol_key (xlator_t *this, loc_t *loc, loc_t *new_loc, + xlator_t **subvol); + +int dht_rename_cleanup (call_frame_t *frame); +int dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata); + +int dht_fix_directory_layout (call_frame_t *frame, + dht_selfheal_dir_cbk_t dir_cbk, + dht_layout_t *layout); + +int dht_init_subvolumes (xlator_t *this, dht_conf_t *conf); + +/* migration/rebalance */ +int dht_start_rebalance_task (xlator_t *this, call_frame_t *frame); + +int dht_rebalance_in_progress_check (xlator_t *this, call_frame_t *frame); +int dht_rebalance_complete_check (xlator_t *this, call_frame_t *frame); + + +/* FOPS */ +int32_t dht_lookup (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *xattr_req); + +int32_t dht_stat (call_frame_t *frame, + xlator_t *this, + loc_t *loc, dict_t *xdata); + +int32_t dht_fstat (call_frame_t *frame, + xlator_t *this, + fd_t *fd, dict_t *xdata); + +int32_t dht_truncate (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + off_t offset, dict_t *xdata); + +int32_t dht_ftruncate (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + off_t offset, dict_t *xdata); + +int32_t dht_access (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t mask, dict_t *xdata); + +int32_t dht_readlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + size_t size, dict_t *xdata); + +int32_t dht_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, + mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata); + +int32_t dht_mkdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata); + +int32_t dht_unlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc, int xflag, dict_t *xdata); + +int32_t dht_rmdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, int flags, dict_t *xdata); + +int32_t dht_symlink (call_frame_t *frame, xlator_t *this, + const char *linkpath, loc_t *loc, mode_t umask, + dict_t *xdata); + +int32_t dht_rename (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc, dict_t *xdata); + +int32_t dht_link (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc, dict_t *xdata); + +int32_t dht_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, mode_t mode, + mode_t umask, fd_t *fd, dict_t *params); + +int32_t dht_open (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, fd_t *fd, dict_t *xdata); + +int32_t dht_readv (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset, uint32_t flags, dict_t *xdata); + +int32_t dht_writev (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t offset, + uint32_t flags, + struct iobref *iobref, dict_t *xdata); + +int32_t dht_flush (call_frame_t *frame, + xlator_t *this, + fd_t *fd, dict_t *xdata); + +int32_t dht_fsync (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t datasync, dict_t *xdata); + +int32_t dht_opendir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, fd_t *fd, dict_t *xdata); + +int32_t dht_fsyncdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t datasync, dict_t *xdata); + +int32_t dht_statfs (call_frame_t *frame, + xlator_t *this, + loc_t *loc, dict_t *xdata); + +int32_t dht_setxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *dict, + int32_t flags, dict_t *xdata); + +int32_t dht_getxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name, dict_t *xdata); + +int32_t dht_fsetxattr (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + dict_t *dict, + int32_t flags, dict_t *xdata); + +int32_t dht_fgetxattr (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + const char *name, dict_t *xdata); + +int32_t dht_removexattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name, dict_t *xdata); +int32_t dht_fremovexattr (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + const char *name, dict_t *xdata); + +int32_t dht_lk (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t cmd, + struct gf_flock *flock, dict_t *xdata); + +int32_t dht_inodelk (call_frame_t *frame, xlator_t *this, + const char *volume, loc_t *loc, int32_t cmd, + struct gf_flock *flock, dict_t *xdata); + +int32_t dht_finodelk (call_frame_t *frame, xlator_t *this, + const char *volume, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata); + +int32_t dht_entrylk (call_frame_t *frame, xlator_t *this, + const char *volume, loc_t *loc, const char *basename, + entrylk_cmd cmd, entrylk_type type, dict_t *xdata); + +int32_t dht_fentrylk (call_frame_t *frame, xlator_t *this, + const char *volume, fd_t *fd, const char *basename, + entrylk_cmd cmd, entrylk_type type, dict_t *xdata); + +int32_t dht_readdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, off_t off, dict_t *xdata); + +int32_t dht_readdirp (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, off_t off, dict_t *dict); + +int32_t dht_xattrop (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + gf_xattrop_flags_t flags, + dict_t *dict, dict_t *xdata); + +int32_t dht_fxattrop (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + gf_xattrop_flags_t flags, + dict_t *dict, dict_t *xdata); + +int32_t dht_forget (xlator_t *this, inode_t *inode); +int32_t dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata); +int32_t dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata); +int32_t dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t mode, off_t offset, size_t len, dict_t *xdata); +int32_t dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, size_t len, dict_t *xdata); +int32_t dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, off_t len, dict_t *xdata); + +int32_t dht_init (xlator_t *this); +void dht_fini (xlator_t *this); +int dht_reconfigure (xlator_t *this, dict_t *options); +int32_t dht_notify (xlator_t *this, int32_t event, void *data, ...); + +/* definitions for nufa/switch */ +int dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent); +int dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent); +int dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent); +int dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent); +int dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + fd_t *fd, inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata); +int dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata); + +int +gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict); + +int +gf_defrag_stop (gf_defrag_info_t *defrag, gf_defrag_status_t status, + dict_t *output); + +void* +gf_defrag_start (void *this); + +int32_t +gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs, + struct iatt *stbuf); +int +dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, + int flag); +int +dht_inode_ctx_layout_get (inode_t *inode, xlator_t *this, + dht_layout_t **layout_int); +int +dht_inode_ctx_layout_set (inode_t *inode, xlator_t *this, + dht_layout_t* layout_int); +int +dht_inode_ctx_time_update (inode_t *inode, xlator_t *this, struct iatt *stat, + int32_t update_ctx); +void dht_inode_ctx_time_set (inode_t *inode, xlator_t *this, struct iatt *stat); -int dht_layout_inode_set (xlator_t *this, xlator_t *subvol, inode_t *inode); -xlator_t *dht_first_up_subvol (xlator_t *this); +int dht_inode_ctx_get (inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx); +int dht_inode_ctx_set (inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx); +int +dht_dir_attr_heal (void *data); +int +dht_dir_attr_heal_done (int ret, call_frame_t *sync_frame, void *data); +int +dht_dir_has_layout (dict_t *xattr, char *name); +gf_boolean_t +dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator); +xlator_t * +dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol, + dht_layout_t *layout); +xlator_t * +dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol, + dht_layout_t *layout); +int +dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this); + +void +dht_layout_dump (dht_layout_t *layout, const char *prefix); +int32_t +dht_priv_dump (xlator_t *this); +int32_t +dht_inodectx_dump (xlator_t *this, inode_t *inode); + +int +dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol); -#endif /* _DHT_H */ +#endif/* _DHT_H */ diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c index 468e88798..fe3955ecb 100644 --- a/xlators/cluster/dht/src/dht-diskusage.c +++ b/xlators/cluster/dht/src/dht-diskusage.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -33,52 +24,70 @@ #include <sys/time.h> -int +int dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct statvfs *statvfs) + int op_ret, int op_errno, struct statvfs *statvfs, + dict_t *xdata) { dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - call_frame_t *prev = NULL; + call_frame_t *prev = NULL; int this_call_cnt = 0; - int i = 0; - double percent = 0; - uint64_t bytes = 0; - - local = frame->local; - conf = this->private; - prev = cookie; - - if (op_ret == -1) - goto out; - - if (statvfs && statvfs->f_blocks) { - percent = (statvfs->f_bfree * 100) / statvfs->f_blocks; - bytes = (statvfs->f_bfree * statvfs->f_bsize); - } - - LOCK (&conf->subvolume_lock); - { - for (i = 0; i < conf->subvolume_cnt; i++) - if (prev->this == conf->subvolumes[i]) { - conf->du_stats[i].avail_percent = percent; - conf->du_stats[i].avail_space = bytes; - gf_log (this->name, GF_LOG_DEBUG, - "on subvolume '%s': avail_percent is: " - "%.2f and avail_space is: %"PRIu64"", - prev->this->name, - conf->du_stats[i].avail_percent, - conf->du_stats[i].avail_space); - } - } - UNLOCK (&conf->subvolume_lock); + int i = 0; + double percent = 0; + double percent_inodes = 0; + uint64_t bytes = 0; - out: + conf = this->private; + prev = cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "failed to get disk info from %s", prev->this->name); + goto out; + } + + if (statvfs && statvfs->f_blocks) { + percent = (statvfs->f_bavail * 100) / statvfs->f_blocks; + bytes = (statvfs->f_bavail * statvfs->f_frsize); + } + + if (statvfs && statvfs->f_files) { + percent_inodes = (statvfs->f_ffree * 100) / statvfs->f_files; + } else { + /* set percent inodes to 100 for dynamically allocated inode filesystems + this logic holds good so that, distribute has nothing to worry about + total inodes rather let the 'create()' to be scheduled on the hashed + subvol regardless of the total inodes. since we have no awareness on + loosing inodes this logic fits well + */ + percent_inodes = 100; + } + + LOCK (&conf->subvolume_lock); + { + for (i = 0; i < conf->subvolume_cnt; i++) + if (prev->this == conf->subvolumes[i]) { + conf->du_stats[i].avail_percent = percent; + conf->du_stats[i].avail_space = bytes; + conf->du_stats[i].avail_inodes = percent_inodes; + gf_log (this->name, GF_LOG_DEBUG, + "on subvolume '%s': avail_percent is: " + "%.2f and avail_space is: %"PRIu64" " + "and avail_inodes is: %.2f", + prev->this->name, + conf->du_stats[i].avail_percent, + conf->du_stats[i].avail_space, + conf->du_stats[i].avail_inodes); + } + } + UNLOCK (&conf->subvolume_lock); + +out: this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) DHT_STACK_DESTROY (frame); - return 0; + return 0; } int @@ -87,177 +96,319 @@ dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx) dht_conf_t *conf = NULL; call_frame_t *statfs_frame = NULL; dht_local_t *statfs_local = NULL; - call_pool_t *pool = NULL; + call_pool_t *pool = NULL; + loc_t tmp_loc = {0,}; conf = this->private; - pool = this->ctx->pool; - - statfs_frame = create_frame (this, pool); - if (!statfs_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } - - statfs_local = dht_local_init (statfs_frame); - if (!statfs_local) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } - - loc_t tmp_loc = { .inode = NULL, - .path = "/", - }; - - statfs_local->call_cnt = 1; - STACK_WIND (statfs_frame, dht_du_info_cbk, - conf->subvolumes[subvol_idx], - conf->subvolumes[subvol_idx]->fops->statfs, - &tmp_loc); - - return 0; - err: + pool = this->ctx->pool; + + statfs_frame = create_frame (this, pool); + if (!statfs_frame) { + goto err; + } + + /* local->fop value is not used in this case */ + statfs_local = dht_local_init (statfs_frame, NULL, NULL, + GF_FOP_MAXVALUE); + if (!statfs_local) { + goto err; + } + + /* make it root gfid, should be enough to get the proper info back */ + tmp_loc.gfid[15] = 1; + + statfs_local->call_cnt = 1; + STACK_WIND (statfs_frame, dht_du_info_cbk, + conf->subvolumes[subvol_idx], + conf->subvolumes[subvol_idx]->fops->statfs, + &tmp_loc, NULL); + + return 0; +err: if (statfs_frame) DHT_STACK_DESTROY (statfs_frame); - - return -1; + + return -1; } int dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc) { - int i = 0; + int i = 0; dht_conf_t *conf = NULL; call_frame_t *statfs_frame = NULL; dht_local_t *statfs_local = NULL; - struct timeval tv = {0,}; + struct timeval tv = {0,}; + loc_t tmp_loc = {0,}; conf = this->private; gettimeofday (&tv, NULL); - if (tv.tv_sec > (conf->refresh_interval - + conf->last_stat_fetch.tv_sec)) { - statfs_frame = copy_frame (frame); - if (!statfs_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } - - statfs_local = dht_local_init (statfs_frame); - if (!statfs_local) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + /* make it root gfid, should be enough to get the proper + info back */ + tmp_loc.gfid[15] = 1; - loc_copy (&statfs_local->loc, loc); - loc_t tmp_loc = { .inode = NULL, - .path = "/", - }; - - statfs_local->call_cnt = conf->subvolume_cnt; - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (statfs_frame, dht_du_info_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->statfs, - &tmp_loc); - } + if (tv.tv_sec > (conf->refresh_interval + + conf->last_stat_fetch.tv_sec)) { - conf->last_stat_fetch.tv_sec = tv.tv_sec; - } - return 0; + statfs_frame = copy_frame (frame); + if (!statfs_frame) { + goto err; + } + + /* In this case, 'local->fop' is not used */ + statfs_local = dht_local_init (statfs_frame, loc, NULL, + GF_FOP_MAXVALUE); + if (!statfs_local) { + goto err; + } + + statfs_local->call_cnt = conf->subvolume_cnt; + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (statfs_frame, dht_du_info_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->statfs, + &tmp_loc, NULL); + } + + conf->last_stat_fetch.tv_sec = tv.tv_sec; + } + return 0; err: if (statfs_frame) DHT_STACK_DESTROY (statfs_frame); - return -1; + return -1; } -int +gf_boolean_t dht_is_subvol_filled (xlator_t *this, xlator_t *subvol) { - int i = 0; - int subvol_filled = 0; + int i = 0; dht_conf_t *conf = NULL; + gf_boolean_t subvol_filled_inodes = _gf_false; + gf_boolean_t subvol_filled_space = _gf_false; + gf_boolean_t is_subvol_filled = _gf_false; - conf = this->private; + conf = this->private; + + /* Check for values above specified percent or free disk */ + LOCK (&conf->subvolume_lock); + { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + if (conf->disk_unit == 'p') { + if (conf->du_stats[i].avail_percent < + conf->min_free_disk) { + subvol_filled_space = _gf_true; + break; + } + + } else { + if (conf->du_stats[i].avail_space < + conf->min_free_disk) { + subvol_filled_space = _gf_true; + break; + } + } + if (conf->du_stats[i].avail_inodes < + conf->min_free_inodes) { + subvol_filled_inodes = _gf_true; + break; + } + } + } + } + UNLOCK (&conf->subvolume_lock); + + if (subvol_filled_space && conf->subvolume_status[i]) { + if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) { + gf_log (this->name, GF_LOG_WARNING, + "disk space on subvolume '%s' is getting " + "full (%.2f %%), consider adding more nodes", + subvol->name, + (100 - conf->du_stats[i].avail_percent)); + } + } + + if (subvol_filled_inodes && conf->subvolume_status[i]) { + if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) { + gf_log (this->name, GF_LOG_CRITICAL, + "inodes on subvolume '%s' are at " + "(%.2f %%), consider adding more nodes", + subvol->name, + (100 - conf->du_stats[i].avail_inodes)); + } + } + + is_subvol_filled = (subvol_filled_space || subvol_filled_inodes); + + return is_subvol_filled; +} + + +/*Get the best subvolume to create the file in*/ +xlator_t * +dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol, + dht_local_t *local) +{ + xlator_t *avail_subvol = NULL; + dht_conf_t *conf = NULL; + dht_layout_t *layout = NULL; + loc_t *loc = NULL; + + conf = this->private; + if (!local) + goto out; + loc = &local->loc; + if (!local->layout) { + layout = dht_layout_get (this, loc->parent); + + if (!layout) { + gf_log (this->name, GF_LOG_DEBUG, + "layout missing path=%s parent=%s", + loc->path, uuid_utoa (loc->parent->gfid)); + goto out; + } + } else { + layout = dht_layout_ref (this, local->layout); + } - /* Check for values above specified percent or free disk */ LOCK (&conf->subvolume_lock); - { - for (i = 0; i < conf->subvolume_cnt; i++) { - if (subvol == conf->subvolumes[i]) { - if (conf->disk_unit == 'p') { - if (conf->du_stats[i].avail_percent < - conf->min_free_disk) { - subvol_filled = 1; - break; - } - } else { - if (conf->du_stats[i].avail_space < - conf->min_free_disk) { - subvol_filled = 1; - break; - } - } - } + { + avail_subvol = dht_subvol_with_free_space_inodes(this, subvol, + layout); + if(!avail_subvol) + { + avail_subvol = dht_subvol_maxspace_nonzeroinode(this, + subvol, + layout); + } + + } + UNLOCK (&conf->subvolume_lock); +out: + if (!avail_subvol) { + gf_log (this->name, + GF_LOG_DEBUG, + "no subvolume has enough free space and/or inodes\ + to create"); + avail_subvol = subvol; + } + + if (layout) + dht_layout_unref (this, layout); + return avail_subvol; +} + +static inline +int32_t dht_subvol_has_err (xlator_t *this, dht_layout_t *layout) +{ + int ret = -1; + int i = 0; + + if (!this || !layout) + goto out; + + /* check if subvol has layout errors, before selecting it */ + for (i = 0; i < layout->cnt; i++) { + if (!strcmp (layout->list[i].xlator->name, this->name) && + (layout->list[i].err != 0)) { + ret = -1; + goto out; } } - UNLOCK (&conf->subvolume_lock); - - if (subvol_filled) { - if (!(conf->du_stats[i].log++ % GF_UNIVERSAL_ANSWER)) { - gf_log (this->name, GF_LOG_WARNING, - "disk space on subvolume '%s' is getting " - "full (%.2f %%), consider adding more nodes", - subvol->name, - (100 - conf->du_stats[i].avail_percent)); + ret = 0; +out: + return ret; +} + +/*Get subvolume which has both space and inodes more than the min criteria*/ +xlator_t * +dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol, + dht_layout_t *layout) +{ + int i = 0; + double max = 0; + double max_inodes = 0; + int ignore_subvol = 0; + + xlator_t *avail_subvol = NULL; + dht_conf_t *conf = NULL; + + conf = this->private; + + for(i=0; i < conf->subvolume_cnt; i++) { + /* check if subvol has layout errors, before selecting it */ + ignore_subvol = dht_subvol_has_err (conf->subvolumes[i], + layout); + if (ignore_subvol) + continue; + + if ((conf->disk_unit == 'p') && + (conf->du_stats[i].avail_percent > conf->min_free_disk) && + (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) { + if ((conf->du_stats[i].avail_inodes > max_inodes) || + (conf->du_stats[i].avail_percent > max)) { + max = conf->du_stats[i].avail_percent; + max_inodes = conf->du_stats[i].avail_inodes; + avail_subvol = conf->subvolumes[i]; + } + } + + if ((conf->disk_unit != 'p') && + (conf->du_stats[i].avail_space > conf->min_free_disk) && + (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) { + if ((conf->du_stats[i].avail_inodes > max_inodes) || + (conf->du_stats[i].avail_space > max)) { + max = conf->du_stats[i].avail_space; + max_inodes = conf->du_stats[i].avail_inodes; + avail_subvol = conf->subvolumes[i]; + } } } - return subvol_filled; + return avail_subvol; } + +/* Get subvol which has atleast one inode and maximum space */ xlator_t * -dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol) +dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol, + dht_layout_t *layout) { int i = 0; - double max= 0; + double max = 0; + int ignore_subvol = 0; + xlator_t *avail_subvol = NULL; - dht_conf_t *conf = NULL; + dht_conf_t *conf = NULL; conf = this->private; - avail_subvol = subvol; - LOCK (&conf->subvolume_lock); - { - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->disk_unit == 'p') { - if (conf->du_stats[i].avail_percent > max) { - max = conf->du_stats[i].avail_percent; - avail_subvol = conf->subvolumes[i]; - } - } else { - if (conf->du_stats[i].avail_space > max) { - max = conf->du_stats[i].avail_space; - avail_subvol = conf->subvolumes[i]; - } + for (i = 0; i < conf->subvolume_cnt; i++) { + /* check if subvol has layout errors, before selecting it */ + ignore_subvol = dht_subvol_has_err (conf->subvolumes[i], + layout); + if (ignore_subvol) + continue; + + if (conf->disk_unit == 'p') { + if ((conf->du_stats[i].avail_percent > max) + && (conf->du_stats[i].avail_inodes > 0 )) { + max = conf->du_stats[i].avail_percent; + avail_subvol = conf->subvolumes[i]; } - } + } else { + if ((conf->du_stats[i].avail_space > max) + && (conf->du_stats[i].avail_inodes > 0)) { + max = conf->du_stats[i].avail_space; + avail_subvol = conf->subvolumes[i]; + } + } } - UNLOCK (&conf->subvolume_lock); - - if (max < conf->min_free_disk) - avail_subvol = subvol; - if (avail_subvol == subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume has enough free space to create"); - } - return avail_subvol; } diff --git a/xlators/cluster/dht/src/dht-hashfn.c b/xlators/cluster/dht/src/dht-hashfn.c index 31cb828a4..656cf23a0 100644 --- a/xlators/cluster/dht/src/dht-hashfn.c +++ b/xlators/cluster/dht/src/dht-hashfn.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -29,58 +20,92 @@ #include "hashfn.h" -typedef enum { - DHT_HASH_TYPE_DM, -} dht_hashfn_type_t; - - int dht_hash_compute_internal (int type, const char *name, uint32_t *hash_p) { - int ret = 0; - uint32_t hash = 0; - - switch (type) { - case DHT_HASH_TYPE_DM: - hash = gf_dm_hashfn (name, strlen (name)); - break; - default: - ret = -1; - break; - } - - if (ret == 0) { - *hash_p = hash; - } - - return ret; + int ret = 0; + uint32_t hash = 0; + + switch (type) { + case DHT_HASH_TYPE_DM: + case DHT_HASH_TYPE_DM_USER: + hash = gf_dm_hashfn (name, strlen (name)); + break; + default: + ret = -1; + break; + } + + if (ret == 0) { + *hash_p = hash; + } + + return ret; } -#define MAKE_RSYNC_FRIENDLY_NAME(rsync_frndly_name, name) do { \ - rsync_frndly_name = (char *) name; \ - if (name[0] == '.') { \ - char *dot = 0; \ - int namelen = 0; \ - \ - dot = strrchr (name, '.'); \ - if (dot && dot > (name + 1) && *(dot + 1)) { \ - namelen = (dot - name); \ - rsync_frndly_name = alloca (namelen); \ - strncpy (rsync_frndly_name, name + 1, \ - namelen); \ - rsync_frndly_name[namelen - 1] = 0; \ - } \ - } \ - } while (0); - +static inline +gf_boolean_t +dht_munge_name (const char *original, char *modified, size_t len, regex_t *re) +{ + regmatch_t matches[2]; + size_t new_len; + + if (regexec(re,original,2,matches,0) != REG_NOMATCH) { + if (matches[1].rm_so != -1) { + new_len = matches[1].rm_eo - matches[1].rm_so; + /* Equal would fail due to the NUL at the end. */ + if (new_len < len) { + memcpy (modified,original+matches[1].rm_so, + new_len); + modified[new_len] = '\0'; + return _gf_true; + } + } + } + + /* This is guaranteed safe because of how the dest was allocated. */ + strcpy(modified,original); + return _gf_false; +} int -dht_hash_compute (int type, const char *name, uint32_t *hash_p) +dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p) { - char *rsync_friendly_name = NULL; - - MAKE_RSYNC_FRIENDLY_NAME (rsync_friendly_name, name); - - return dht_hash_compute_internal (type, rsync_friendly_name, hash_p); + char *rsync_friendly_name = NULL; + dht_conf_t *priv = this->private; + size_t len = 0; + gf_boolean_t munged = _gf_false; + + /* + * It wouldn't be safe to use alloca in an inline function that doesn't + * actually get inlined, and it wouldn't be efficient to do a real + * allocation, so we use alloca here (if needed) and pass that to the + * inline. + */ + + if (priv->extra_regex_valid) { + len = strlen(name) + 1; + rsync_friendly_name = alloca(len); + munged = dht_munge_name (name, rsync_friendly_name, len, + &priv->extra_regex); + } + + if (!munged && priv->rsync_regex_valid) { + len = strlen(name) + 1; + rsync_friendly_name = alloca(len); + gf_log (this->name, GF_LOG_TRACE, "trying regex for %s", name); + munged = dht_munge_name (name, rsync_friendly_name, len, + &priv->rsync_regex); + if (munged) { + gf_log (this->name, GF_LOG_DEBUG, + "munged down to %s", rsync_friendly_name); + } + } + + if (!munged) { + rsync_friendly_name = (char *)name; + } + + return dht_hash_compute_internal (type, rsync_friendly_name, hash_p); } diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c index 10fc1ccda..f1dc5072f 100644 --- a/xlators/cluster/dht/src/dht-helper.c +++ b/xlators/cluster/dht/src/dht-helper.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -27,177 +18,406 @@ #include "xlator.h" #include "dht-common.h" +static inline int +dht_inode_ctx_set1 (xlator_t *this, inode_t *inode, xlator_t *subvol) +{ + uint64_t tmp_subvol = 0; + + tmp_subvol = (long)subvol; + return inode_ctx_set1 (inode, this, &tmp_subvol); +} + +int +dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol) +{ + int ret = -1; + uint64_t tmp_subvol = 0; + + ret = inode_ctx_get1 (inode, this, &tmp_subvol); + if (tmp_subvol && subvol) + *subvol = (xlator_t *)tmp_subvol; + + return ret; +} + int dht_frame_return (call_frame_t *frame) { - dht_local_t *local = NULL; - int this_call_cnt = -1; + dht_local_t *local = NULL; + int this_call_cnt = -1; + + if (!frame) + return -1; - if (!frame) - return -1; + local = frame->local; + + LOCK (&frame->lock); + { + this_call_cnt = --local->call_cnt; + } + UNLOCK (&frame->lock); - local = frame->local; + return this_call_cnt; +} + + +static uint64_t +dht_bits_for (uint64_t num) +{ + uint64_t bits = 0, ctrl = 1; - LOCK (&frame->lock); - { - this_call_cnt = --local->call_cnt; + while (ctrl < num) { + ctrl *= 2; + bits ++; } - UNLOCK (&frame->lock); - return this_call_cnt; + return bits; } +/* + * A slightly "updated" version of the algorithm described in the commit log + * is used here. + * + * The only enhancement is that: + * + * - The number of bits used by the backend filesystem for HUGE d_off which + * is described as 63, and + * - The number of bits used by the d_off presented by the transformation + * upwards which is described as 64, are both made "configurable." + */ + + +#define BACKEND_D_OFF_BITS 63 +#define PRESENT_D_OFF_BITS 63 + +#define ONE 1ULL +#define MASK (~0ULL) +#define PRESENT_MASK (MASK >> (64 - PRESENT_D_OFF_BITS)) +#define BACKEND_MASK (MASK >> (64 - BACKEND_D_OFF_BITS)) + +#define TOP_BIT (ONE << (PRESENT_D_OFF_BITS - 1)) +#define SHIFT_BITS (max (0, (BACKEND_D_OFF_BITS - PRESENT_D_OFF_BITS + 1))) int dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p) { - dht_conf_t *conf = NULL; - int cnt = 0; - int max = 0; - uint64_t y = 0; + dht_conf_t *conf = NULL; + int cnt = 0; + int max = 0; + uint64_t y = 0; + uint64_t hi_mask = 0; + uint64_t off_mask = 0; + int max_bits = 0; + + if (x == ((uint64_t) -1)) { + y = (uint64_t) -1; + goto out; + } + + conf = this->private; + if (!conf) + goto out; + max = conf->subvolume_cnt; + cnt = dht_subvol_cnt (this, subvol); - if (x == ((uint64_t) -1)) { - y = (uint64_t) -1; + if (max == 1) { + y = x; goto out; } - conf = this->private; + max_bits = dht_bits_for (max); - max = conf->subvolume_cnt; - cnt = dht_subvol_cnt (this, subvol); + hi_mask = ~(PRESENT_MASK >> (max_bits + 1)); - y = ((x * max) + cnt); + if (x & hi_mask) { + /* HUGE d_off */ + off_mask = MASK << max_bits; + y = TOP_BIT | ((x >> SHIFT_BITS) & off_mask) | cnt; + } else { + /* small d_off */ + y = ((x * max) + cnt); + } out: - if (y_p) - *y_p = y; + if (y_p) + *y_p = y; - return 0; + return 0; } +int +dht_filter_loc_subvol_key (xlator_t *this, loc_t *loc, loc_t *new_loc, + xlator_t **subvol) +{ + char *new_name = NULL; + char *new_path = NULL; + xlator_list_t *trav = NULL; + char key[1024] = {0,}; + int ret = 0; /* not found */ + + /* Why do other tasks if first required 'char' itself is not there */ + if (!new_loc || !loc || !loc->name || !strchr (loc->name, '@')) + goto out; + + trav = this->children; + while (trav) { + snprintf (key, 1024, "*@%s:%s", this->name, trav->xlator->name); + if (fnmatch (key, loc->name, FNM_NOESCAPE) == 0) { + new_name = GF_CALLOC(strlen (loc->name), + sizeof (char), + gf_common_mt_char); + if (!new_name) + goto out; + if (fnmatch (key, loc->path, FNM_NOESCAPE) == 0) { + new_path = GF_CALLOC(strlen (loc->path), + sizeof (char), + gf_common_mt_char); + if (!new_path) + goto out; + strncpy (new_path, loc->path, (strlen (loc->path) - + strlen (key) + 1)); + } + strncpy (new_name, loc->name, (strlen (loc->name) - + strlen (key) + 1)); + + if (new_loc) { + new_loc->path = ((new_path) ? new_path: + gf_strdup (loc->path)); + new_loc->name = new_name; + new_loc->inode = inode_ref (loc->inode); + new_loc->parent = inode_ref (loc->parent); + } + *subvol = trav->xlator; + ret = 1; /* success */ + goto out; + } + trav = trav->next; + } +out: + if (!ret) { + /* !success */ + GF_FREE (new_path); + GF_FREE (new_name); + } + return ret; +} int dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p, - uint64_t *x_p) + uint64_t *x_p) { - dht_conf_t *conf = NULL; - int cnt = 0; - int max = 0; - uint64_t x = 0; - xlator_t *subvol = 0; + dht_conf_t *conf = NULL; + int cnt = 0; + int max = 0; + uint64_t x = 0; + xlator_t *subvol = 0; + int max_bits = 0; + uint64_t off_mask = 0; + uint64_t host_mask = 0; + + if (!this->private) + return -1; + + conf = this->private; + max = conf->subvolume_cnt; + + if (max == 1) { + x = y; + cnt = 0; + goto out; + } + if (y & TOP_BIT) { + /* HUGE d_off */ + max_bits = dht_bits_for (max); + off_mask = (MASK << max_bits); + host_mask = ~(off_mask); - conf = this->private; - max = conf->subvolume_cnt; + x = ((y & ~TOP_BIT) & off_mask) << SHIFT_BITS; - cnt = y % max; - x = y / max; + cnt = y & host_mask; + } else { + /* small d_off */ + cnt = y % max; + x = y / max; + } - subvol = conf->subvolumes[cnt]; +out: + subvol = conf->subvolumes[cnt]; - if (subvol_p) - *subvol_p = subvol; + if (subvol_p) + *subvol_p = subvol; - if (x_p) - *x_p = x; + if (x_p) + *x_p = x; - return 0; + return 0; } void -dht_local_wipe (dht_local_t *local) +dht_local_wipe (xlator_t *this, dht_local_t *local) { - if (!local) - return; + if (!local) + return; - loc_wipe (&local->loc); - loc_wipe (&local->loc2); + loc_wipe (&local->loc); + loc_wipe (&local->loc2); - if (local->xattr) - dict_unref (local->xattr); + if (local->xattr) + dict_unref (local->xattr); - if (local->inode) - inode_unref (local->inode); + if (local->inode) + inode_unref (local->inode); - if (local->layout) - FREE (local->layout); + if (local->layout) { + dht_layout_unref (this, local->layout); + local->layout = NULL; + } - loc_wipe (&local->linkfile.loc); + loc_wipe (&local->linkfile.loc); - if (local->linkfile.xattr) - dict_unref (local->linkfile.xattr); + if (local->linkfile.xattr) + dict_unref (local->linkfile.xattr); - if (local->linkfile.inode) - inode_unref (local->linkfile.inode); + if (local->linkfile.inode) + inode_unref (local->linkfile.inode); - if (local->fd) { - fd_unref (local->fd); - local->fd = NULL; - } - - if (local->xattr_req) - dict_unref (local->xattr_req); + if (local->fd) { + fd_unref (local->fd); + local->fd = NULL; + } + + if (local->params) { + dict_unref (local->params); + local->params = NULL; + } + + if (local->xattr_req) + dict_unref (local->xattr_req); + + if (local->selfheal.layout) { + dht_layout_unref (this, local->selfheal.layout); + local->selfheal.layout = NULL; + } + + GF_FREE (local->newpath); - FREE (local); + GF_FREE (local->key); + + GF_FREE (local->rebalance.vector); + + if (local->rebalance.iobref) + iobref_unref (local->rebalance.iobref); + + mem_put (local); } dht_local_t * -dht_local_init (call_frame_t *frame) +dht_local_init (call_frame_t *frame, loc_t *loc, fd_t *fd, glusterfs_fop_t fop) { - dht_local_t *local = NULL; + dht_local_t *local = NULL; + inode_t *inode = NULL; + int ret = 0; - /* TODO: use mem-pool */ - local = CALLOC (1, sizeof (*local)); + local = mem_get0 (THIS->local_pool); + if (!local) + goto out; - if (!local) - return NULL; + if (loc) { + ret = loc_copy (&local->loc, loc); + if (ret) + goto out; - local->op_ret = -1; - local->op_errno = EUCLEAN; + inode = loc->inode; + } - frame->local = local; + if (fd) { + local->fd = fd_ref (fd); + if (!inode) + inode = fd->inode; + } - return local; -} + local->op_ret = -1; + local->op_errno = EUCLEAN; + local->fop = fop; + + if (inode) { + local->layout = dht_layout_get (frame->this, inode); + local->cached_subvol = dht_subvol_get_cached (frame->this, + inode); + } + frame->local = local; + +out: + if (ret) { + if (local) + mem_put (local); + local = NULL; + } + return local; +} -char * -basestr (const char *str) +xlator_t * +dht_first_up_subvol (xlator_t *this) { - char *basestr = NULL; + dht_conf_t *conf = NULL; + xlator_t *child = NULL; + int i = 0; + time_t time = 0; + + conf = this->private; + if (!conf) + goto out; - basestr = strrchr (str, '/'); - if (basestr) - basestr ++; + LOCK (&conf->subvolume_lock); + { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvol_up_time[i]) { + if (!time) { + time = conf->subvol_up_time[i]; + child = conf->subvolumes[i]; + } else if (time > conf->subvol_up_time[i]) { + time = conf->subvol_up_time[i]; + child = conf->subvolumes[i]; + } + } + } + } + UNLOCK (&conf->subvolume_lock); - return basestr; +out: + return child; } xlator_t * -dht_first_up_subvol (xlator_t *this) +dht_last_up_subvol (xlator_t *this) { - dht_conf_t *conf = NULL; - xlator_t *child = NULL; - int i = 0; - - conf = this->private; - - LOCK (&conf->subvolume_lock); - { - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->subvolume_status[i]) { - child = conf->subvolumes[i]; - break; - } - } - } - UNLOCK (&conf->subvolume_lock); - - return child; + dht_conf_t *conf = NULL; + xlator_t *child = NULL; + int i = 0; + + conf = this->private; + if (!conf) + goto out; + + LOCK (&conf->subvolume_lock); + { + for (i = conf->subvolume_cnt-1; i >= 0; i--) { + if (conf->subvolume_status[i]) { + child = conf->subvolumes[i]; + break; + } + } + } + UNLOCK (&conf->subvolume_lock); + +out: + return child; } xlator_t * @@ -206,17 +426,23 @@ dht_subvol_get_hashed (xlator_t *this, loc_t *loc) dht_layout_t *layout = NULL; xlator_t *subvol = NULL; - if (is_fs_root (loc)) { + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + + if (__is_root_gfid (loc->gfid)) { subvol = dht_first_up_subvol (this); goto out; } + GF_VALIDATE_OR_GOTO (this->name, loc->parent, out); + GF_VALIDATE_OR_GOTO (this->name, loc->name, out); + layout = dht_layout_get (this, loc->parent); if (!layout) { gf_log (this->name, GF_LOG_DEBUG, - "layout missing path=%s parent=%"PRId64, - loc->path, loc->parent->ino); + "layout missing path=%s parent=%s", + loc->path, uuid_utoa (loc->parent->gfid)); goto out; } @@ -230,6 +456,10 @@ dht_subvol_get_hashed (xlator_t *this, loc_t *loc) } out: + if (layout) { + dht_layout_unref (this, layout); + } + return subvol; } @@ -240,6 +470,8 @@ dht_subvol_get_cached (xlator_t *this, inode_t *inode) dht_layout_t *layout = NULL; xlator_t *subvol = NULL; + GF_VALIDATE_OR_GOTO (this->name, this, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); layout = dht_layout_get (this, inode); @@ -247,9 +479,13 @@ dht_subvol_get_cached (xlator_t *this, inode_t *inode) goto out; } - subvol = layout->list[0].xlator; + subvol = layout->list[0].xlator; out: + if (layout) { + dht_layout_unref (this, layout); + } + return subvol; } @@ -257,70 +493,714 @@ out: xlator_t * dht_subvol_next (xlator_t *this, xlator_t *prev) { - dht_conf_t *conf = NULL; - int i = 0; - xlator_t *next = NULL; - - conf = this->private; - - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->subvolumes[i] == prev) { - if ((i + 1) < conf->subvolume_cnt) - next = conf->subvolumes[i + 1]; - break; - } - } + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *next = NULL; - return next; + conf = this->private; + if (!conf) + goto out; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == prev) { + if ((i + 1) < conf->subvolume_cnt) + next = conf->subvolumes[i + 1]; + break; + } + } + +out: + return next; } +/* This func wraps around, if prev is actually the last subvol. + */ +xlator_t * +dht_subvol_next_available (xlator_t *this, xlator_t *prev) +{ + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *next = NULL; + conf = this->private; + if (!conf) + goto out; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == prev) { + /* if prev is last in conf->subvolumes, then wrap + * around. + */ + if ((i + 1) < conf->subvolume_cnt) { + next = conf->subvolumes[i + 1]; + } else { + next = conf->subvolumes[0]; + } + break; + } + } + +out: + return next; +} int dht_subvol_cnt (xlator_t *this, xlator_t *subvol) { - int i = 0; - int ret = -1; - dht_conf_t *conf = NULL; + int i = 0; + int ret = -1; + dht_conf_t *conf = NULL; + conf = this->private; + if (!conf) + goto out; - conf = this->private; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + ret = i; + break; + } + } - for (i = 0; i < conf->subvolume_cnt; i++) { - if (subvol == conf->subvolumes[i]) { - ret = i; - break; - } - } +out: + return ret; +} + + +#define set_if_greater(a, b) do { \ + if ((a) < (b)) \ + (a) = (b); \ + } while (0) + + +#define set_if_greater_time(a, an, b, bn) do { \ + if (((a) < (b)) || (((a) == (b)) && ((an) < (bn)))){ \ + (a) = (b); \ + (an) = (bn); \ + } \ + } while (0) \ + + +int +dht_iatt_merge (xlator_t *this, struct iatt *to, + struct iatt *from, xlator_t *subvol) +{ + if (!from || !to) + return 0; + + to->ia_dev = from->ia_dev; + + uuid_copy (to->ia_gfid, from->ia_gfid); + + to->ia_ino = from->ia_ino; + to->ia_prot = from->ia_prot; + to->ia_type = from->ia_type; + to->ia_nlink = from->ia_nlink; + to->ia_rdev = from->ia_rdev; + to->ia_size += from->ia_size; + to->ia_blksize = from->ia_blksize; + to->ia_blocks += from->ia_blocks; + + set_if_greater (to->ia_uid, from->ia_uid); + set_if_greater (to->ia_gid, from->ia_gid); + + set_if_greater_time(to->ia_atime, to->ia_atime_nsec, + from->ia_atime, from->ia_atime_nsec); + set_if_greater_time (to->ia_mtime, to->ia_mtime_nsec, + from->ia_mtime, from->ia_mtime_nsec); + set_if_greater_time (to->ia_ctime, to->ia_ctime_nsec, + from->ia_ctime, from->ia_ctime_nsec); + + return 0; +} + +int +dht_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) +{ + if (!child) { + goto err; + } + + if (strcmp (parent->path, "/") == 0) + gf_asprintf ((char **)&child->path, "/%s", name); + else + gf_asprintf ((char **)&child->path, "%s/%s", parent->path, name); + + if (!child->path) { + goto err; + } + + child->name = strrchr (child->path, '/'); + if (child->name) + child->name++; + + child->parent = inode_ref (parent->inode); + child->inode = inode_new (parent->inode->table); + + if (!child->inode) { + goto err; + } + + return 0; +err: + loc_wipe (child); + return -1; +} + + + +int +dht_init_subvolumes (xlator_t *this, dht_conf_t *conf) +{ + xlator_list_t *subvols = NULL; + int cnt = 0; + + if (!conf) + return -1; + + for (subvols = this->children; subvols; subvols = subvols->next) + cnt++; + + conf->subvolumes = GF_CALLOC (cnt, sizeof (xlator_t *), + gf_dht_mt_xlator_t); + if (!conf->subvolumes) { + return -1; + } + conf->subvolume_cnt = cnt; + + cnt = 0; + for (subvols = this->children; subvols; subvols = subvols->next) + conf->subvolumes[cnt++] = subvols->xlator; + + conf->subvolume_status = GF_CALLOC (cnt, sizeof (char), + gf_dht_mt_char); + if (!conf->subvolume_status) { + return -1; + } + + conf->last_event = GF_CALLOC (cnt, sizeof (int), + gf_dht_mt_char); + if (!conf->last_event) { + return -1; + } + + conf->subvol_up_time = GF_CALLOC (cnt, sizeof (time_t), + gf_dht_mt_subvol_time); + if (!conf->subvol_up_time) { + return -1; + } + + conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t), + gf_dht_mt_dht_du_t); + if (!conf->du_stats) { + return -1; + } + + conf->decommissioned_bricks = GF_CALLOC (cnt, sizeof (xlator_t *), + gf_dht_mt_xlator_t); + if (!conf->decommissioned_bricks) { + return -1; + } - return ret; + return 0; } -#define set_if_greater(a, b) do { \ - if ((a) < (b)) \ - (a) = (b); \ - } while (0) + + +static int +dht_migration_complete_check_done (int op_ret, call_frame_t *frame, void *data) +{ + dht_local_t *local = NULL; + + local = frame->local; + + local->rebalance.target_op_fn (THIS, frame, op_ret); + + return 0; +} + int -dht_stat_merge (xlator_t *this, struct stat *to, - struct stat *from, xlator_t *subvol) +dht_migration_complete_check_task (void *data) { - to->st_dev = from->st_dev; + int ret = -1; + xlator_t *src_node = NULL; + xlator_t *dst_node = NULL; + dht_local_t *local = NULL; + dict_t *dict = NULL; + dht_layout_t *layout = NULL; + struct iatt stbuf = {0,}; + xlator_t *this = NULL; + call_frame_t *frame = NULL; + loc_t tmp_loc = {0,}; + char *path = NULL; + dht_conf_t *conf = NULL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + uint64_t tmp_subvol = 0; + int open_failed = 0; + + this = THIS; + frame = data; + local = frame->local; + conf = this->private; + + src_node = local->cached_subvol; + + if (!local->loc.inode && !local->fd) { + local->op_errno = EINVAL; + goto out; + } + + inode = (!local->fd) ? local->loc.inode : local->fd->inode; + + /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr + * as root:root. If a fd is already open, access check wont be done*/ + + if (!local->loc.inode) { + ret = syncop_fgetxattr (src_node, local->fd, &dict, + conf->link_xattr_name); + } else { + SYNCTASK_SETID (0, 0); + ret = syncop_getxattr (src_node, &local->loc, &dict, + conf->link_xattr_name); + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + } + + if (!ret) + dst_node = dht_linkfile_subvol (this, NULL, NULL, dict); + + if (ret) { + if (!dht_inode_missing(-ret) || (!local->loc.inode)) { + local->op_errno = -ret; + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to get the 'linkto' xattr %s", + local->loc.path, strerror (-ret)); + ret = -1; + goto out; + } + + /* Need to do lookup on hashed subvol, then get the file */ + ret = syncop_lookup (this, &local->loc, NULL, &stbuf, NULL, + NULL); + if (ret) { + local->op_errno = -ret; + ret = -1; + goto out; + } + + dst_node = dht_subvol_get_cached (this, local->loc.inode); + } + + if (!dst_node) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to get the destination node", + local->loc.path); + ret = -1; + local->op_errno = EINVAL; + goto out; + } - dht_itransform (this, subvol, from->st_ino, &to->st_ino); + /* lookup on dst */ + if (local->loc.inode) { + ret = syncop_lookup (dst_node, &local->loc, NULL, &stbuf, NULL, + NULL); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to lookup the file on %s", + local->loc.path, dst_node->name); + local->op_errno = -ret; + ret = -1; + goto out; + } + + if (uuid_compare (stbuf.ia_gfid, local->loc.inode->gfid)) { + gf_log (this->name, GF_LOG_ERROR, + "%s: gfid different on the target file on %s", + local->loc.path, dst_node->name); + ret = -1; + local->op_errno = EIO; + goto out; + } + } - to->st_mode = from->st_mode; - to->st_nlink = from->st_nlink; - to->st_uid = from->st_uid; - to->st_gid = from->st_gid; - to->st_rdev = from->st_rdev; - to->st_size += from->st_size; - to->st_blksize = from->st_blksize; - to->st_blocks += from->st_blocks; + /* update inode ctx (the layout) */ + dht_layout_unref (this, local->layout); - set_if_greater (to->st_atime, from->st_atime); - set_if_greater (to->st_mtime, from->st_mtime); - set_if_greater (to->st_ctime, from->st_ctime); + ret = dht_layout_preset (this, dst_node, inode); + if (ret != 0) { + gf_log (this->name, GF_LOG_DEBUG, + "%s: could not set preset layout for subvol %s", + local->loc.path, dst_node->name); + ret = -1; + local->op_errno = EINVAL; + goto out; + } + + layout = dht_layout_for_subvol (this, dst_node); + if (!layout) { + gf_log (this->name, GF_LOG_INFO, + "%s: no pre-set layout for subvolume %s", + local->loc.path, dst_node ? dst_node->name : "<nil>"); + ret = -1; + local->op_errno = EINVAL; + goto out; + } - return 0; + ret = dht_layout_set (this, inode, layout); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set the new layout", + local->loc.path); + local->op_errno = EINVAL; + goto out; + } + + local->cached_subvol = dst_node; + ret = 0; + + /* once we detect the migration complete, the inode-ctx2 is no more + required.. delete the ctx and also, it means, open() already + done on all the fd of inode */ + ret = inode_ctx_reset1 (inode, this, &tmp_subvol); + if (tmp_subvol) + goto out; + + if (list_empty (&inode->fd_list)) + goto out; + + /* perform open as root:root. There is window between linkfile + * creation(root:root) and setattr with the correct uid/gid + */ + SYNCTASK_SETID(0, 0); + + /* perform 'open()' on all the fd's present on the inode */ + tmp_loc.inode = inode; + inode_path (inode, NULL, &path); + if (path) + tmp_loc.path = path; + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + if (fd_is_anonymous (iter_fd)) + continue; + + /* flags for open are stripped down to allow following the + * new location of the file, otherwise we can get EEXIST or + * truncate the file again as rebalance is moving the data */ + ret = syncop_open (dst_node, &tmp_loc, + (iter_fd->flags & + ~(O_CREAT | O_EXCL | O_TRUNC)), iter_fd); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "failed to open " + "the fd (%p, flags=0%o) on file %s @ %s", + iter_fd, iter_fd->flags, path, dst_node->name); + open_failed = 1; + local->op_errno = -ret; + ret = -1; + } + } + GF_FREE (path); + + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + + if (open_failed) { + ret = -1; + goto out; + } + ret = 0; +out: + + return ret; +} + +int +dht_rebalance_complete_check (xlator_t *this, call_frame_t *frame) +{ + int ret = -1; + + ret = synctask_new (this->ctx->env, dht_migration_complete_check_task, + dht_migration_complete_check_done, + frame, frame); + return ret; +} + +/* During 'in-progress' state, both nodes should have the file */ +static int +dht_inprogress_check_done (int op_ret, call_frame_t *sync_frame, void *data) +{ + dht_local_t *local = NULL; + + local = sync_frame->local; + + local->rebalance.target_op_fn (THIS, sync_frame, op_ret); + + return 0; +} + +static int +dht_rebalance_inprogress_task (void *data) +{ + int ret = -1; + xlator_t *src_node = NULL; + xlator_t *dst_node = NULL; + dht_local_t *local = NULL; + dict_t *dict = NULL; + call_frame_t *frame = NULL; + xlator_t *this = NULL; + char *path = NULL; + struct iatt stbuf = {0,}; + loc_t tmp_loc = {0,}; + dht_conf_t *conf = NULL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + int open_failed = 0; + + this = THIS; + frame = data; + local = frame->local; + conf = this->private; + + src_node = local->cached_subvol; + + if (!local->loc.inode && !local->fd) + goto out; + + inode = (!local->fd) ? local->loc.inode : local->fd->inode; + + /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr + * as root:root. If a fd is already open, access check wont be done*/ + if (local->loc.inode) { + SYNCTASK_SETID (0, 0); + ret = syncop_getxattr (src_node, &local->loc, &dict, + conf->link_xattr_name); + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + } else { + ret = syncop_fgetxattr (src_node, local->fd, &dict, + conf->link_xattr_name); + } + + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to get the 'linkto' xattr %s", + local->loc.path, strerror (-ret)); + ret = -1; + goto out; + } + + dst_node = dht_linkfile_subvol (this, NULL, NULL, dict); + if (!dst_node) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to get the 'linkto' xattr from dict", + local->loc.path); + ret = -1; + goto out; + } + + local->rebalance.target_node = dst_node; + + if (local->loc.inode) { + /* lookup on dst */ + ret = syncop_lookup (dst_node, &local->loc, NULL, + &stbuf, NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to lookup the file on %s", + local->loc.path, dst_node->name); + ret = -1; + goto out; + } + + if (uuid_compare (stbuf.ia_gfid, local->loc.inode->gfid)) { + gf_log (this->name, GF_LOG_ERROR, + "%s: gfid different on the target file on %s", + local->loc.path, dst_node->name); + ret = -1; + goto out; + } + } + + ret = 0; + + if (list_empty (&inode->fd_list)) + goto done; + + /* perform open as root:root. There is window between linkfile + * creation(root:root) and setattr with the correct uid/gid + */ + SYNCTASK_SETID (0, 0); + + tmp_loc.inode = inode; + inode_path (inode, NULL, &path); + if (path) + tmp_loc.path = path; + + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + if (fd_is_anonymous (iter_fd)) + continue; + + /* flags for open are stripped down to allow following the + * new location of the file, otherwise we can get EEXIST or + * truncate the file again as rebalance is moving the data */ + ret = syncop_open (dst_node, &tmp_loc, + (iter_fd->flags & + ~(O_CREAT | O_EXCL | O_TRUNC)), iter_fd); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "failed to send open " + "the fd (%p, flags=0%o) on file %s @ %s", + iter_fd, iter_fd->flags, path, dst_node->name); + ret = -1; + open_failed = 1; + } + } + GF_FREE (path); + + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + + if (open_failed) { + ret = -1; + goto out; + } + +done: + ret = dht_inode_ctx_set1 (this, inode, dst_node); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set inode-ctx target file at %s", + local->loc.path, dst_node->name); + goto out; + } + + ret = 0; +out: + return ret; +} + +int +dht_rebalance_in_progress_check (xlator_t *this, call_frame_t *frame) +{ + + int ret = -1; + + ret = synctask_new (this->ctx->env, dht_rebalance_inprogress_task, + dht_inprogress_check_done, + frame, frame); + return ret; +} + +int +dht_inode_ctx_layout_set (inode_t *inode, xlator_t *this, + dht_layout_t *layout_int) +{ + dht_inode_ctx_t *ctx = NULL; + int ret = -1; + + ret = dht_inode_ctx_get (inode, this, &ctx); + if (!ret && ctx) { + ctx->layout = layout_int; + } else { + ctx = GF_CALLOC (1, sizeof (*ctx), gf_dht_mt_inode_ctx_t); + if (!ctx) + return ret; + ctx->layout = layout_int; + } + + ret = dht_inode_ctx_set (inode, this, ctx); + + return ret; +} + + +void +dht_inode_ctx_time_set (inode_t *inode, xlator_t *this, struct iatt *stat) +{ + dht_inode_ctx_t *ctx = NULL; + dht_stat_time_t *time = 0; + int ret = -1; + + ret = dht_inode_ctx_get (inode, this, &ctx); + + if (ret) + return; + + time = &ctx->time; + + time->mtime = stat->ia_mtime; + time->mtime_nsec = stat->ia_mtime_nsec; + + time->ctime = stat->ia_ctime; + time->ctime_nsec = stat->ia_ctime_nsec; + + time->atime = stat->ia_atime; + time->atime_nsec = stat->ia_atime_nsec; + + return; +} + + +int +dht_inode_ctx_time_update (inode_t *inode, xlator_t *this, struct iatt *stat, + int32_t post) +{ + dht_inode_ctx_t *ctx = NULL; + dht_stat_time_t *time = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO (this->name, stat, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); + + ret = dht_inode_ctx_get (inode, this, &ctx); + + if (ret) { + ctx = GF_CALLOC (1, sizeof (*ctx), gf_dht_mt_inode_ctx_t); + if (!ctx) + return -1; + } + + time = &ctx->time; + + DHT_UPDATE_TIME(time->mtime, time->mtime_nsec, + stat->ia_mtime, stat->ia_mtime_nsec, inode, post); + DHT_UPDATE_TIME(time->ctime, time->ctime_nsec, + stat->ia_ctime, stat->ia_ctime_nsec, inode, post); + DHT_UPDATE_TIME(time->atime, time->atime_nsec, + stat->ia_atime, stat->ia_atime_nsec, inode, post); + + ret = dht_inode_ctx_set (inode, this, ctx); +out: + return 0; +} + +int +dht_inode_ctx_get (inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx) +{ + int ret = -1; + uint64_t ctx_int = 0; + + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); + + ret = inode_ctx_get (inode, this, &ctx_int); + + if (ret) + return ret; + + if (ctx) + *ctx = (dht_inode_ctx_t *) ctx_int; +out: + return ret; +} + +int dht_inode_ctx_set (inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx) +{ + int ret = -1; + uint64_t ctx_int = 0; + + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); + GF_VALIDATE_OR_GOTO (this->name, ctx, out); + + ctx_int = (long)ctx; + ret = inode_ctx_set (inode, this, &ctx_int); +out: + return ret; } diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c new file mode 100644 index 000000000..e8a9a7196 --- /dev/null +++ b/xlators/cluster/dht/src/dht-inode-read.c @@ -0,0 +1,1139 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "dht-common.h" + +int dht_access2 (xlator_t *this, call_frame_t *frame, int ret); +int dht_readv2 (xlator_t *this, call_frame_t *frame, int ret); +int dht_attr2 (xlator_t *this, call_frame_t *frame, int ret); +int dht_open2 (xlator_t *this, call_frame_t *frame, int ret); +int dht_flush2 (xlator_t *this, call_frame_t *frame, int ret); +int dht_lk2 (xlator_t *this, call_frame_t *frame, int ret); +int dht_fsync2 (xlator_t *this, call_frame_t *frame, int ret); + +int +dht_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = 0; + + local = frame->local; + prev = cookie; + + local->op_errno = op_errno; + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto out; + } + + if (!op_ret || (local->call_cnt != 1)) + goto out; + + /* rebalance would have happened */ + local->rebalance.target_op_fn = dht_open2; + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + +out: + DHT_STACK_UNWIND (open, frame, op_ret, op_errno, local->fd, xdata); + + return 0; +} + +int +dht_open2 (xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + int op_errno = EINVAL; + + local = frame->local; + if (!local) + goto out; + + op_errno = ENOENT; + if (op_ret) + goto out; + + local->call_cnt = 2; + subvol = local->cached_subvol; + + STACK_WIND (frame, dht_open_cbk, subvol, subvol->fops->open, + &local->loc, local->rebalance.flags, local->fd, + NULL); + return 0; + +out: + DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); + return 0; +} + +int +dht_open (call_frame_t *frame, xlator_t *this, + loc_t *loc, int flags, fd_t *fd, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, loc, fd, GF_FOP_OPEN); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local->rebalance.flags = flags; + local->call_cnt = 1; + + STACK_WIND (frame, dht_open_cbk, subvol, subvol->fops->open, + loc, flags, fd, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (open, frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int +dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *stbuf, dict_t *xdata) +{ + xlator_t *subvol = 0; + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + inode_t *inode = NULL; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto out; + } + + if (local->call_cnt != 1) + goto out; + + local->op_errno = op_errno; + /* Check if the rebalance phase2 is true */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) { + inode = (local->fd) ? local->fd->inode : local->loc.inode; + ret = dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) { + /* Phase 2 of migration */ + local->rebalance.target_op_fn = dht_attr2; + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } else { + /* value is already set in fd_ctx, that means no need + to check for whether its complete or not. */ + dht_attr2 (this, frame, 0); + return 0; + } + } + +out: + DHT_STRIP_PHASE1_FLAGS (stbuf); + DHT_STACK_UNWIND (stat, frame, op_ret, op_errno, stbuf, xdata); +err: + return 0; +} + +int +dht_attr2 (xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + int op_errno = EINVAL; + + local = frame->local; + if (!local) + goto out; + + op_errno = local->op_errno; + if (op_ret == -1) + goto out; + + subvol = local->cached_subvol; + local->call_cnt = 2; + + if (local->fop == GF_FOP_FSTAT) { + STACK_WIND (frame, dht_file_attr_cbk, subvol, + subvol->fops->fstat, local->fd, NULL); + } else { + STACK_WIND (frame, dht_file_attr_cbk, subvol, + subvol->fops->stat, &local->loc, NULL); + } + return 0; + +out: + DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); + return 0; +} + +int +dht_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *stbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + + goto unlock; + } + + dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); + + local->op_ret = 0; + } +unlock: + UNLOCK (&frame->lock); +out: + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + DHT_STACK_UNWIND (stat, frame, local->op_ret, local->op_errno, + &local->stbuf, xdata); + } +err: + return 0; +} + +int +dht_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int i = 0; + int call_cnt = 0; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + + local = dht_local_init (frame, loc, NULL, GF_FOP_STAT); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + layout = local->layout; + if (!layout) { + gf_log (this->name, GF_LOG_DEBUG, + "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + if (IA_ISREG (loc->inode->ia_type)) { + local->call_cnt = 1; + + subvol = local->cached_subvol; + + STACK_WIND (frame, dht_file_attr_cbk, subvol, + subvol->fops->stat, loc, xdata); + + return 0; + } + + local->call_cnt = call_cnt = layout->cnt; + + for (i = 0; i < call_cnt; i++) { + subvol = layout->list[i].xlator; + + STACK_WIND (frame, dht_attr_cbk, + subvol, subvol->fops->stat, + loc, xdata); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int +dht_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int i = 0; + int call_cnt = 0; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_FSTAT); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + layout = local->layout; + if (!layout) { + gf_log (this->name, GF_LOG_DEBUG, + "no layout for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + if (IA_ISREG (fd->inode->ia_type)) { + local->call_cnt = 1; + + subvol = local->cached_subvol; + + STACK_WIND (frame, dht_file_attr_cbk, subvol, + subvol->fops->fstat, fd, xdata); + + return 0; + } + + local->call_cnt = call_cnt = layout->cnt; + + for (i = 0; i < call_cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND (frame, dht_attr_cbk, + subvol, subvol->fops->fstat, + fd, xdata); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int +dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + struct iovec *vector, int count, struct iatt *stbuf, + struct iobref *iobref, dict_t *xdata) +{ + dht_local_t *local = NULL; + int ret = 0; + inode_t *inode = NULL; + xlator_t *subvol = 0; + + local = frame->local; + if (!local) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + /* This is already second try, no need for re-check */ + if (local->call_cnt != 1) + goto out; + + if ((op_ret == -1) && !dht_inode_missing(op_errno)) + goto out; + + local->op_errno = op_errno; + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) { + /* File would be migrated to other node */ + ret = dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) { + local->rebalance.target_op_fn = dht_readv2; + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } else { + /* value is already set in fd_ctx, that means no need + to check for whether its complete or not. */ + dht_readv2 (this, frame, 0); + return 0; + } + } + +out: + DHT_STRIP_PHASE1_FLAGS (stbuf); + DHT_STACK_UNWIND (readv, frame, op_ret, op_errno, vector, count, stbuf, + iobref, xdata); + + return 0; +} + +int +dht_readv2 (xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + int op_errno = EINVAL; + + local = frame->local; + if (!local) + goto out; + + op_errno = local->op_errno; + if (op_ret == -1) + goto out; + + local->call_cnt = 2; + subvol = local->cached_subvol; + + STACK_WIND (frame, dht_readv_cbk, subvol, subvol->fops->readv, + local->fd, local->rebalance.size, local->rebalance.offset, + local->rebalance.flags, NULL); + + return 0; + +out: + DHT_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL); + return 0; +} + +int +dht_readv (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t off, uint32_t flags, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_READ); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local->rebalance.offset = off; + local->rebalance.size = size; + local->rebalance.flags = flags; + local->call_cnt = 1; + + STACK_WIND (frame, dht_readv_cbk, + subvol, subvol->fops->readv, + fd, size, off, flags, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL); + + return 0; +} + +int +dht_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + int ret = -1; + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + call_frame_t *prev = NULL; + + local = frame->local; + prev = cookie; + + if (!prev || !prev->this) + goto out; + if (local->call_cnt != 1) + goto out; + if ((op_ret == -1) && (op_errno == ENOTCONN) && + IA_ISDIR(local->loc.inode->ia_type)) { + + subvol = dht_subvol_next_available (this, prev->this); + if (!subvol) + goto out; + + /* check if we are done with visiting every node */ + if (subvol == local->cached_subvol) { + goto out; + } + + STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access, + &local->loc, local->rebalance.flags, NULL); + return 0; + } + if ((op_ret == -1) && dht_inode_missing(op_errno)) { + /* File would be migrated to other node */ + local->op_errno = op_errno; + local->rebalance.target_op_fn = dht_access2; + ret = dht_rebalance_complete_check (frame->this, frame); + if (!ret) + return 0; + } + +out: + DHT_STACK_UNWIND (access, frame, op_ret, op_errno, xdata); + return 0; +} + +int +dht_access2 (xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + int op_errno = EINVAL; + + local = frame->local; + if (!local) + goto out; + + op_errno = local->op_errno; + if (op_ret == -1) + goto out; + + local->call_cnt = 2; + subvol = local->cached_subvol; + + STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access, + &local->loc, local->rebalance.flags, NULL); + + return 0; + +out: + DHT_STACK_UNWIND (access, frame, -1, op_errno, NULL); + return 0; +} + + +int +dht_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + local = dht_local_init (frame, loc, NULL, GF_FOP_ACCESS); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.flags = mask; + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access, + loc, mask, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (access, frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + inode_t *inode = NULL; + xlator_t *subvol = 0; + + local = frame->local; + + local->op_errno = op_errno; + + if (local->call_cnt != 1) + goto out; + + /* If context is set, then send flush() it to the destination */ + dht_inode_ctx_get1 (this, inode, &subvol); + if (subvol) { + dht_flush2 (this, frame, 0); + return 0; + } + +out: + DHT_STACK_UNWIND (flush, frame, op_ret, op_errno, xdata); + + return 0; +} + +int +dht_flush2 (xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND (frame, dht_flush_cbk, + subvol, subvol->fops->flush, local->fd, NULL); + + return 0; +} + + +int +dht_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_FLUSH); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local->call_cnt = 1; + + STACK_WIND (frame, dht_flush_cbk, + subvol, subvol->fops->flush, fd, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (flush, frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + inode_t *inode = NULL; + xlator_t *subvol = 0; + + local = frame->local; + prev = cookie; + + local->op_errno = op_errno; + if (op_ret == -1 && !dht_inode_missing(op_errno)) { + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + + local->op_errno = op_errno; + dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) { + local->rebalance.target_op_fn = dht_fsync2; + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + + ret = dht_rebalance_in_progress_check (this, frame); + } + + /* Check if the rebalance phase2 is true */ + if (IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + } + if (!ret) + return 0; + } else { + dht_fsync2 (this, frame, 0); + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (fsync, frame, op_ret, op_errno, + prebuf, postbuf, xdata); + + return 0; +} + +int +dht_fsync2 (xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND (frame, dht_fsync_cbk, subvol, subvol->fops->fsync, + local->fd, local->rebalance.flags, NULL); + + return 0; +} + +int +dht_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync, + dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_FSYNC); + if (!local) { + op_errno = ENOMEM; + + goto err; + } + + local->call_cnt = 1; + local->rebalance.flags = datasync; + + subvol = local->cached_subvol; + + STACK_WIND (frame, dht_fsync_cbk, subvol, subvol->fops->fsync, + fd, datasync, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + + +/* TODO: for 'lk()' call, we need some other special error, may be ESTALE to + indicate that lock migration happened on the fd, so we can consider it as + phase 2 of migration */ +int +dht_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct gf_flock *flock, dict_t *xdata) +{ + DHT_STACK_UNWIND (lk, frame, op_ret, op_errno, flock, xdata); + + return 0; +} + + +int +dht_lk (call_frame_t *frame, xlator_t *this, + fd_t *fd, int cmd, struct gf_flock *flock, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + /* TODO: for rebalance, we need to preserve the fop arguments */ + STACK_WIND (frame, dht_lk_cbk, subvol, subvol->fops->lk, fd, + cmd, flock, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); + + return 0; +} + +/* Symlinks are currently not migrated, so no need for any check here */ +int +dht_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, const char *path, + struct iatt *stbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + + local = frame->local; + if (op_ret == -1) + goto err; + + if (!local) { + op_ret = -1; + op_errno = EINVAL; + } + +err: + DHT_STRIP_PHASE1_FLAGS (stbuf); + DHT_STACK_UNWIND (readlink, frame, op_ret, op_errno, path, stbuf, xdata); + + return 0; +} + + +int +dht_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + local = dht_local_init (frame, loc, NULL, GF_FOP_READLINK); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_readlink_cbk, + subvol, subvol->fops->readlink, + loc, size, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (readlink, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + +/* Currently no translators on top of 'distribute' will be using + * below fops, hence not implementing 'migration' related checks + */ + +int +dht_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) +{ + DHT_STACK_UNWIND (xattrop, frame, op_ret, op_errno, dict, xdata); + return 0; +} + + +int +dht_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + local = dht_local_init (frame, loc, NULL, GF_FOP_XATTROP); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local->call_cnt = 1; + + STACK_WIND (frame, + dht_xattrop_cbk, + subvol, subvol->fops->xattrop, + loc, flags, dict, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int +dht_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) +{ + DHT_STACK_UNWIND (fxattrop, frame, op_ret, op_errno, dict, xdata); + return 0; +} + + +int +dht_fxattrop (call_frame_t *frame, xlator_t *this, + fd_t *fd, gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + dht_fxattrop_cbk, + subvol, subvol->fops->fxattrop, + fd, flags, dict, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int +dht_inodelk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) + +{ + DHT_STACK_UNWIND (inodelk, frame, op_ret, op_errno, xdata); + return 0; +} + + +int32_t +dht_inodelk (call_frame_t *frame, xlator_t *this, const char *volume, + loc_t *loc, int32_t cmd, struct gf_flock *lock, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + local = dht_local_init (frame, loc, NULL, GF_FOP_INODELK); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local->call_cnt = 1; + + STACK_WIND (frame, + dht_inodelk_cbk, + subvol, subvol->fops->inodelk, + volume, loc, cmd, lock, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) + +{ + DHT_STACK_UNWIND (finodelk, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +dht_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, + fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + + STACK_WIND (frame, dht_finodelk_cbk, subvol, subvol->fops->finodelk, + volume, fd, cmd, lock, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL); + + return 0; +} diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c new file mode 100644 index 000000000..576f007e5 --- /dev/null +++ b/xlators/cluster/dht/src/dht-inode-write.c @@ -0,0 +1,1017 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "dht-common.h" + +int dht_writev2 (xlator_t *this, call_frame_t *frame, int ret); +int dht_truncate2 (xlator_t *this, call_frame_t *frame, int ret); +int dht_setattr2 (xlator_t *this, call_frame_t *frame, int ret); +int dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret); +int dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret); +int dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret); + +int +dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + int ret = -1; + xlator_t *subvol = NULL; + + if (op_ret == -1 && !dht_inode_missing(op_errno)) { + goto out; + } + + local = frame->local; + if (!local) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + if (local->call_cnt != 1) { + /* preserve the modes of source */ + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + + local->rebalance.target_op_fn = dht_writev2; + + local->op_errno = op_errno; + /* Phase 2 of migration */ + if (IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + + ret = dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol) { + dht_writev2 (this, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check (this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + + DHT_STACK_UNWIND (writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + + return 0; +} + +int +dht_writev2 (xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND (frame, dht_writev_cbk, + subvol, subvol->fops->writev, + local->fd, local->rebalance.vector, local->rebalance.count, + local->rebalance.offset, local->rebalance.flags, + local->rebalance.iobref, NULL); + + return 0; +} + +int +dht_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int count, off_t off, uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_WRITE); + if (!local) { + + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + + local->rebalance.vector = iov_dup (vector, count); + local->rebalance.offset = off; + local->rebalance.count = count; + local->rebalance.flags = flags; + local->rebalance.iobref = iobref_ref (iobref); + local->call_cnt = 1; + + STACK_WIND (frame, dht_writev_cbk, + subvol, subvol->fops->writev, + fd, vector, count, off, flags, iobref, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + + + +int +dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + xlator_t *subvol = NULL; + inode_t *inode = NULL; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + + local->rebalance.target_op_fn = dht_truncate2; + + local->op_errno = op_errno; + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + inode = (local->fd) ? local->fd->inode : local->loc.inode; + dht_inode_ctx_get1 (this, inode, &subvol); + if (subvol) { + dht_truncate2 (this, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check (this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (truncate, frame, op_ret, op_errno, + prebuf, postbuf, xdata); +err: + return 0; +} + + +int +dht_truncate2 (xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + inode_t *inode = NULL; + + local = frame->local; + + inode = local->fd ? local->fd->inode : local->loc.inode; + + dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + if (local->fop == GF_FOP_TRUNCATE) { + STACK_WIND (frame, dht_truncate_cbk, subvol, + subvol->fops->truncate, &local->loc, + local->rebalance.offset, NULL); + } else { + STACK_WIND (frame, dht_truncate_cbk, subvol, + subvol->fops->ftruncate, local->fd, + local->rebalance.offset, NULL); + } + + return 0; +} + +int +dht_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + local = dht_local_init (frame, loc, NULL, GF_FOP_TRUNCATE); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.offset = offset; + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_truncate_cbk, + subvol, subvol->fops->truncate, + loc, offset, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + +int +dht_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_FTRUNCATE); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.offset = offset; + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_truncate_cbk, + subvol, subvol->fops->ftruncate, + fd, offset, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + + +int +dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + xlator_t *subvol = NULL; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + local->rebalance.target_op_fn = dht_fallocate2; + + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol) { + dht_fallocate2 (this, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check (this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (fallocate, frame, op_ret, op_errno, + prebuf, postbuf, xdata); +err: + return 0; +} + +int +dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND(frame, dht_fallocate_cbk, subvol, subvol->fops->fallocate, + local->fd, local->rebalance.flags, local->rebalance.offset, + local->rebalance.size, NULL); + + return 0; +} + +int +dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_FALLOCATE); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.flags = mode; + local->rebalance.offset = offset; + local->rebalance.size = len; + + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_fallocate_cbk, + subvol, subvol->fops->fallocate, + fd, mode, offset, len, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + + +int +dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + xlator_t *subvol = NULL; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + local->rebalance.target_op_fn = dht_discard2; + + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol) { + dht_discard2 (this, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check (this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (discard, frame, op_ret, op_errno, + prebuf, postbuf, xdata); +err: + return 0; +} + +int +dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND(frame, dht_discard_cbk, subvol, subvol->fops->discard, + local->fd, local->rebalance.offset, local->rebalance.size, + NULL); + + return 0; +} + +int +dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_DISCARD); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.offset = offset; + local->rebalance.size = len; + + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_discard_cbk, subvol, subvol->fops->discard, + fd, offset, len, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + +int +dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + local->rebalance.target_op_fn = dht_zerofill2; + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + ret = fd_ctx_get (local->fd, this, NULL); + if (!ret) { + dht_zerofill2 (this, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check (this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (zerofill, frame, op_ret, op_errno, + prebuf, postbuf, xdata); +err: + return 0; +} + +int +dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + uint64_t tmp_subvol = 0; + int ret = -1; + + local = frame->local; + + if (local->fd) + ret = fd_ctx_get (local->fd, this, &tmp_subvol); + if (!ret) + subvol = (xlator_t *)(long)tmp_subvol; + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND(frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill, + local->fd, local->rebalance.offset, local->rebalance.size, + NULL); + + return 0; +} + +int +dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_ZEROFILL); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.offset = offset; + local->rebalance.size = len; + + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill, + fd, offset, len, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + + + +/* handle cases of migration here for 'setattr()' calls */ +int +dht_file_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + + local = frame->local; + prev = cookie; + + local->op_errno = op_errno; + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto out; + } + + if (local->call_cnt != 1) + goto out; + + local->rebalance.target_op_fn = dht_setattr2; + + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* At the end of the migration process, whatever 'attr' we + have on source file will be migrated to destination file + in one shot, hence we don't need to check for in progress + state here (ie, PHASE1) */ +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (setattr, frame, op_ret, op_errno, + prebuf, postbuf, xdata); + + return 0; +} + +int +dht_setattr2 (xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + inode_t *inode = NULL; + + local = frame->local; + + inode = (local->fd) ? local->fd->inode : local->loc.inode; + + dht_inode_ctx_get1 (this, inode, &subvol); + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + if (local->fop == GF_FOP_SETATTR) { + STACK_WIND (frame, dht_file_setattr_cbk, subvol, + subvol->fops->setattr, &local->loc, + &local->rebalance.stbuf, local->rebalance.flags, + NULL); + } else { + STACK_WIND (frame, dht_file_setattr_cbk, subvol, + subvol->fops->fsetattr, local->fd, + &local->rebalance.stbuf, local->rebalance.flags, + NULL); + } + + return 0; +} + + +/* Keep the existing code same for all the cases other than regular file */ +int +dht_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + + + local = frame->local; + prev = cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto unlock; + } + + dht_iatt_merge (this, &local->prebuf, statpre, prev->this); + dht_iatt_merge (this, &local->stbuf, statpost, prev->this); + + local->op_ret = 0; + } +unlock: + UNLOCK (&frame->lock); + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + if (local->op_ret == 0) + dht_inode_ctx_time_set (local->loc.inode, this, + &local->stbuf); + DHT_STACK_UNWIND (setattr, frame, local->op_ret, local->op_errno, + &local->prebuf, &local->stbuf, xdata); + } + + return 0; +} + + +int +dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + xlator_t *subvol = NULL; + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + int i = -1; + int call_cnt = 0; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + local = dht_local_init (frame, loc, NULL, GF_FOP_SETATTR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + layout = local->layout; + if (!layout) { + gf_log (this->name, GF_LOG_DEBUG, + "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + if (!layout_is_sane (layout)) { + gf_log (this->name, GF_LOG_DEBUG, + "layout is not sane for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + if (IA_ISREG (loc->inode->ia_type)) { + /* in the regular file _cbk(), we need to check for + migration possibilities */ + local->rebalance.stbuf = *stbuf; + local->rebalance.flags = valid; + local->call_cnt = 1; + subvol = local->cached_subvol; + + STACK_WIND (frame, dht_file_setattr_cbk, subvol, + subvol->fops->setattr, + loc, stbuf, valid, xdata); + + return 0; + } + + local->call_cnt = call_cnt = layout->cnt; + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_setattr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->setattr, + loc, stbuf, valid, xdata); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + + +int +dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + xlator_t *subvol = NULL; + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + int i = -1; + int call_cnt = 0; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_FSETATTR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + layout = local->layout; + if (!layout) { + gf_log (this->name, GF_LOG_DEBUG, + "no layout for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + if (!layout_is_sane (layout)) { + gf_log (this->name, GF_LOG_DEBUG, + "layout is not sane for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + if (IA_ISREG (fd->inode->ia_type)) { + /* in the regular file _cbk(), we need to check for + migration possibilities */ + local->rebalance.stbuf = *stbuf; + local->rebalance.flags = valid; + local->call_cnt = 1; + subvol = local->cached_subvol; + + STACK_WIND (frame, dht_file_setattr_cbk, subvol, + subvol->fops->fsetattr, + fd, stbuf, valid, xdata); + + return 0; + } + + local->call_cnt = call_cnt = layout->cnt; + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_setattr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->fsetattr, + fd, stbuf, valid, xdata); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c index f2e3de14b..e1a37b77c 100644 --- a/xlators/cluster/dht/src/dht-layout.c +++ b/xlators/cluster/dht/src/dht-layout.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -34,587 +25,781 @@ #define layout_size(cnt) (layout_base_size + (cnt * layout_entry_size)) - dht_layout_t * dht_layout_new (xlator_t *this, int cnt) { - dht_layout_t *layout = NULL; + dht_layout_t *layout = NULL; dht_conf_t *conf = NULL; - conf = this->private; - layout = CALLOC (1, layout_size (cnt)); - if (!layout) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto out; - } + layout = GF_CALLOC (1, layout_size (cnt), + gf_dht_mt_dht_layout_t); + if (!layout) { + goto out; + } - layout->cnt = cnt; - if (conf) + layout->type = DHT_HASH_TYPE_DM; + layout->cnt = cnt; + + if (conf) { + layout->spread_cnt = conf->dir_spread_cnt; layout->gen = conf->gen; + } + + layout->ref = 1; + out: - return layout; + return layout; } dht_layout_t * dht_layout_get (xlator_t *this, inode_t *inode) { - uint64_t layout = 0; - int ret = -1; + dht_conf_t *conf = NULL; + dht_layout_t *layout = NULL; - ret = inode_ctx_get (inode, this, &layout); + conf = this->private; + if (!conf) + goto out; + + LOCK (&conf->layout_lock); + { + dht_inode_ctx_layout_get (inode, this, &layout); + if (layout) { + layout->ref++; + } + } + UNLOCK (&conf->layout_lock); - return (dht_layout_t *)(long)layout; +out: + return layout; } -xlator_t * -dht_layout_search (xlator_t *this, dht_layout_t *layout, const char *name) +int +dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout) { - uint32_t hash = 0; - xlator_t *subvol = NULL; - int i = 0; - int ret = 0; - + dht_conf_t *conf = NULL; + int oldret = -1; + int ret = 0; + dht_layout_t *old_layout; - ret = dht_hash_compute (layout->type, name, &hash); - if (ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "hash computation failed for type=%d name=%s", - layout->type, name); - goto out; - } + conf = this->private; + if (!conf) + goto out; - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].start <= hash - && layout->list[i].stop >= hash) { - subvol = layout->list[i].xlator; - break; - } - } + LOCK (&conf->layout_lock); + { + oldret = dht_inode_ctx_layout_get (inode, this, &old_layout); + layout->ref++; + dht_inode_ctx_layout_set (inode, this, layout); + } + UNLOCK (&conf->layout_lock); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume for hash (value) = %u", hash); - } + if (!oldret) { + dht_layout_unref (this, old_layout); + } out: - return subvol; + return ret; } -dht_layout_t * -dht_layout_for_subvol (xlator_t *this, xlator_t *subvol) +void +dht_layout_unref (xlator_t *this, dht_layout_t *layout) { - dht_conf_t *conf = NULL; - dht_layout_t *layout = NULL; - int i = 0; + dht_conf_t *conf = NULL; + int ref = 0; + if (!layout || layout->preset || !this->private) + return; - conf = this->private; + conf = this->private; - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->subvolumes[i] == subvol) { - layout = conf->file_layouts[i]; - break; - } - } + LOCK (&conf->layout_lock); + { + ref = --layout->ref; + } + UNLOCK (&conf->layout_lock); - return layout; + if (!ref) + GF_FREE (layout); } -int -dht_layouts_init (xlator_t *this, dht_conf_t *conf) +dht_layout_t * +dht_layout_ref (xlator_t *this, dht_layout_t *layout) { - dht_layout_t *layout = NULL; - int i = 0; - int ret = -1; - - - conf->file_layouts = CALLOC (conf->subvolume_cnt, - sizeof (dht_layout_t *)); - if (!conf->file_layouts) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto out; - } + dht_conf_t *conf = NULL; - for (i = 0; i < conf->subvolume_cnt; i++) { - layout = dht_layout_new (this, 1); + if (layout->preset || !this->private) + return layout; - if (!layout) { - goto out; - } + conf = this->private; + LOCK (&conf->layout_lock); + { + layout->ref++; + } + UNLOCK (&conf->layout_lock); - layout->preset = 1; + return layout; +} - layout->list[0].xlator = conf->subvolumes[i]; - conf->file_layouts[i] = layout; - } +xlator_t * +dht_layout_search (xlator_t *this, dht_layout_t *layout, const char *name) +{ + uint32_t hash = 0; + xlator_t *subvol = NULL; + int i = 0; + int ret = 0; + + + ret = dht_hash_compute (this, layout->type, name, &hash); + if (ret != 0) { + gf_log (this->name, GF_LOG_WARNING, + "hash computation failed for type=%d name=%s", + layout->type, name); + goto out; + } + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].start <= hash + && layout->list[i].stop >= hash) { + subvol = layout->list[i].xlator; + break; + } + } + + if (!subvol) { + gf_log (this->name, GF_LOG_WARNING, + "no subvolume for hash (value) = %u", hash); + } - ret = 0; out: - return ret; + return subvol; } -int -dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout, - int pos, int32_t **disk_layout_p) +dht_layout_t * +dht_layout_for_subvol (xlator_t *this, xlator_t *subvol) { - int ret = -1; - int32_t *disk_layout = NULL; - - disk_layout = CALLOC (5, sizeof (int)); - if (!disk_layout) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto out; - } + dht_conf_t *conf = NULL; + dht_layout_t *layout = NULL; + int i = 0; - disk_layout[0] = hton32 (1); - disk_layout[1] = hton32 (layout->type); - disk_layout[2] = hton32 (layout->list[pos].start); - disk_layout[3] = hton32 (layout->list[pos].stop); + conf = this->private; + if (!conf) + goto out; - if (disk_layout_p) - *disk_layout_p = disk_layout; - ret = 0; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == subvol) { + layout = conf->file_layouts[i]; + break; + } + } out: - return ret; + return layout; } int -dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, - int pos, int32_t *disk_layout) +dht_layouts_init (xlator_t *this, dht_conf_t *conf) { - int cnt = 0; - int type = 0; - int start_off = 0; - int stop_off = 0; + dht_layout_t *layout = NULL; + int i = 0; + int ret = -1; - /* TODO: assert disk_layout_ptr is of required length */ + if (!conf) + goto out; - cnt = ntoh32 (disk_layout[0]); - if (cnt != 1) { - gf_log (this->name, GF_LOG_DEBUG, - "disk layout has invalid count %d", cnt); - return -1; - } + conf->file_layouts = GF_CALLOC (conf->subvolume_cnt, + sizeof (dht_layout_t *), + gf_dht_mt_dht_layout_t); + if (!conf->file_layouts) { + goto out; + } - /* TODO: assert type is compatible */ - type = ntoh32 (disk_layout[1]); - start_off = ntoh32 (disk_layout[2]); - stop_off = ntoh32 (disk_layout[3]); + for (i = 0; i < conf->subvolume_cnt; i++) { + layout = dht_layout_new (this, 1); - layout->list[pos].start = start_off; - layout->list[pos].stop = stop_off; + if (!layout) { + goto out; + } - gf_log (this->name, GF_LOG_TRACE, - "merged to layout: %u - %u (type %d) from %s", - start_off, stop_off, type, - layout->list[pos].xlator->name); + layout->preset = 1; - return 0; + layout->list[0].xlator = conf->subvolumes[i]; + + conf->file_layouts[i] = layout; + } + + ret = 0; +out: + return ret; } int -dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, - int op_ret, int op_errno, dict_t *xattr) +dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout, + int pos, int32_t **disk_layout_p) { - int i = 0; - int ret = -1; - int err = -1; - int32_t *disk_layout = NULL; + int ret = -1; + int32_t *disk_layout = NULL; + disk_layout = GF_CALLOC (5, sizeof (int), + gf_dht_mt_int32_t); + if (!disk_layout) { + goto out; + } - if (op_ret != 0) { - err = op_errno; - } + disk_layout[0] = hton32 (1); + disk_layout[1] = hton32 (layout->type); + disk_layout[2] = hton32 (layout->list[pos].start); + disk_layout[3] = hton32 (layout->list[pos].stop); - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].xlator == NULL) { - layout->list[i].err = err; - layout->list[i].xlator = subvol; - break; - } - } + if (disk_layout_p) + *disk_layout_p = disk_layout; + else + GF_FREE (disk_layout); - if (op_ret != 0) { - ret = 0; - goto out; - } + ret = 0; + +out: + return ret; +} - if (xattr) { - /* during lookup and not mkdir */ - ret = dict_get_ptr (xattr, "trusted.glusterfs.dht", - VOID(&disk_layout)); - } - if (ret != 0) { - layout->list[i].err = -1; - gf_log (this->name, GF_LOG_TRACE, - "missing disk layout on %s. err = %d", - subvol->name, err); - ret = 0; - goto out; +int +dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, + int pos, void *disk_layout_raw, int disk_layout_len) +{ + int cnt = 0; + int type = 0; + int start_off = 0; + int stop_off = 0; + int disk_layout[4]; + + if (!disk_layout_raw) { + gf_log (this->name, GF_LOG_CRITICAL, + "error no layout on disk for merge"); + return -1; } - ret = dht_disk_layout_merge (this, layout, i, disk_layout); - if (ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "layout merge from subvolume %s failed", - subvol->name); - goto out; + GF_ASSERT (disk_layout_len == sizeof (disk_layout)); + + memcpy (disk_layout, disk_layout_raw, disk_layout_len); + + cnt = ntoh32 (disk_layout[0]); + if (cnt != 1) { + gf_log (this->name, GF_LOG_ERROR, + "disk layout has invalid count %d", cnt); + return -1; + } + + type = ntoh32 (disk_layout[1]); + switch (type) { + case DHT_HASH_TYPE_DM_USER: + gf_log (this->name, GF_LOG_DEBUG, "found user-set layout"); + layout->type = type; + /* Fall through. */ + case DHT_HASH_TYPE_DM: + break; + default: + gf_log (this->name, GF_LOG_CRITICAL, + "Catastrophic error layout with unknown type found %d", + disk_layout[1]); + return -1; } - layout->list[i].err = 0; + + start_off = ntoh32 (disk_layout[2]); + stop_off = ntoh32 (disk_layout[3]); + + layout->list[pos].start = start_off; + layout->list[pos].stop = stop_off; + + gf_log (this->name, GF_LOG_TRACE, + "merged to layout: %u - %u (type %d) from %s", + start_off, stop_off, type, + layout->list[pos].xlator->name); + + return 0; +} + + +int +dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, + int op_ret, int op_errno, dict_t *xattr) +{ + int i = 0; + int ret = -1; + int err = -1; + void *disk_layout_raw = NULL; + int disk_layout_len = 0; + dht_conf_t *conf = this->private; + + if (op_ret != 0) { + err = op_errno; + } + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].xlator == NULL) { + layout->list[i].err = err; + layout->list[i].xlator = subvol; + break; + } + } + + if (op_ret != 0) { + ret = 0; + goto out; + } + + if (xattr) { + /* during lookup and not mkdir */ + ret = dict_get_ptr_and_len (xattr, conf->xattr_name, + &disk_layout_raw, &disk_layout_len); + } + + if (ret != 0) { + layout->list[i].err = 0; + gf_log (this->name, GF_LOG_TRACE, + "missing disk layout on %s. err = %d", + subvol->name, err); + ret = 0; + goto out; + } + + ret = dht_disk_layout_merge (this, layout, i, disk_layout_raw, + disk_layout_len); + if (ret != 0) { + gf_log (this->name, GF_LOG_WARNING, + "layout merge from subvolume %s failed", + subvol->name); + goto out; + } + layout->list[i].err = 0; out: - return ret; + return ret; } void dht_layout_entry_swap (dht_layout_t *layout, int i, int j) { - uint32_t start_swap = 0; - uint32_t stop_swap = 0; - xlator_t *xlator_swap = 0; - int err_swap = 0; - - - start_swap = layout->list[i].start; - stop_swap = layout->list[i].stop; - xlator_swap = layout->list[i].xlator; - err_swap = layout->list[i].err; - - layout->list[i].start = layout->list[j].start; - layout->list[i].stop = layout->list[j].stop; - layout->list[i].xlator = layout->list[j].xlator; - layout->list[i].err = layout->list[j].err; - - layout->list[j].start = start_swap; - layout->list[j].stop = stop_swap; - layout->list[j].xlator = xlator_swap; - layout->list[j].err = err_swap; + uint32_t start_swap = 0; + uint32_t stop_swap = 0; + xlator_t *xlator_swap = 0; + int err_swap = 0; + + start_swap = layout->list[i].start; + stop_swap = layout->list[i].stop; + xlator_swap = layout->list[i].xlator; + err_swap = layout->list[i].err; + + layout->list[i].start = layout->list[j].start; + layout->list[i].stop = layout->list[j].stop; + layout->list[i].xlator = layout->list[j].xlator; + layout->list[i].err = layout->list[j].err; + + layout->list[j].start = start_swap; + layout->list[j].stop = stop_swap; + layout->list[j].xlator = xlator_swap; + layout->list[j].err = err_swap; +} + +void +dht_layout_range_swap (dht_layout_t *layout, int i, int j) +{ + uint32_t start_swap = 0; + uint32_t stop_swap = 0; + + start_swap = layout->list[i].start; + stop_swap = layout->list[i].stop; + + layout->list[i].start = layout->list[j].start; + layout->list[i].stop = layout->list[j].stop; + + layout->list[j].start = start_swap; + layout->list[j].stop = stop_swap; } int64_t dht_layout_entry_cmp_volname (dht_layout_t *layout, int i, int j) { - return (strcmp (layout->list[i].xlator->name, - layout->list[j].xlator->name)); + return (strcmp (layout->list[i].xlator->name, + layout->list[j].xlator->name)); +} + + +gf_boolean_t +dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator) +{ + int i = 0; + + for (i = 0; i < layout->cnt; i++) { + /* Check if xlator is already part of layout, and layout is + * non-zero. */ + if (!strcmp (layout->list[i].xlator->name, xlator->name)) { + if (layout->list[i].start != layout->list[i].stop) + return _gf_true; + break; + } + } + return _gf_false; } int64_t dht_layout_entry_cmp (dht_layout_t *layout, int i, int j) { - int64_t diff = 0; + int64_t diff = 0; - if (layout->list[i].err || layout->list[j].err) - diff = layout->list[i].err - layout->list[j].err; - else - diff = (int64_t) layout->list[i].start - - (int64_t) layout->list[j].start; + /* swap zero'ed out layouts to front, if needed */ + if (!layout->list[j].start && !layout->list[j].stop) { + diff = (int64_t) layout->list[i].stop + - (int64_t) layout->list[j].stop; + goto out; + } + diff = (int64_t) layout->list[i].start + - (int64_t) layout->list[j].start; - return diff; +out: + return diff; } int dht_layout_sort (dht_layout_t *layout) { - int i = 0; - int j = 0; - int64_t ret = 0; - - /* TODO: O(n^2) -- bad bad */ - - for (i = 0; i < layout->cnt - 1; i++) { - for (j = i + 1; j < layout->cnt; j++) { - ret = dht_layout_entry_cmp (layout, i, j); - if (ret > 0) - dht_layout_entry_swap (layout, i, j); - } - } + int i = 0; + int j = 0; + int64_t ret = 0; - return 0; + /* TODO: O(n^2) -- bad bad */ + + for (i = 0; i < layout->cnt - 1; i++) { + for (j = i + 1; j < layout->cnt; j++) { + ret = dht_layout_entry_cmp (layout, i, j); + if (ret > 0) + dht_layout_entry_swap (layout, i, j); + } + } + + return 0; } int dht_layout_sort_volname (dht_layout_t *layout) { - int i = 0; - int j = 0; - int64_t ret = 0; - - /* TODO: O(n^2) -- bad bad */ - - for (i = 0; i < layout->cnt - 1; i++) { - for (j = i + 1; j < layout->cnt; j++) { - ret = dht_layout_entry_cmp_volname (layout, i, j); - if (ret > 0) - dht_layout_entry_swap (layout, i, j); - } - } + int i = 0; + int j = 0; + int64_t ret = 0; + + /* TODO: O(n^2) -- bad bad */ - return 0; + for (i = 0; i < layout->cnt - 1; i++) { + for (j = i + 1; j < layout->cnt; j++) { + ret = dht_layout_entry_cmp_volname (layout, i, j); + if (ret > 0) + dht_layout_entry_swap (layout, i, j); + } + } + + return 0; } int dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, - uint32_t *holes_p, uint32_t *overlaps_p, - uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p) + uint32_t *holes_p, uint32_t *overlaps_p, + uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p, + uint32_t *no_space_p) { - dht_conf_t *conf = NULL; - uint32_t holes = 0; - uint32_t overlaps = 0; - uint32_t missing = 0; - uint32_t down = 0; - uint32_t misc = 0; - uint32_t hole_cnt = 0; - uint32_t overlap_cnt = 0; - int i = 0; - int ret = 0; - uint32_t prev_stop = 0; - uint32_t last_stop = 0; - char is_virgin = 1; - - - conf = this->private; - - /* TODO: explain WTF is happening */ - - last_stop = layout->list[0].start - 1; - prev_stop = last_stop; - - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].err) { - switch (layout->list[i].err) { - case -1: - case ENOENT: - missing++; - break; - case ENOTCONN: - down++; - break; - case ENOSPC: - down++; - break; - default: - misc++; - } - continue; - } - - is_virgin = 0; - - if ((prev_stop + 1) < layout->list[i].start) { - hole_cnt++; - holes += (layout->list[i].start - (prev_stop + 1)); - } - - if ((prev_stop + 1) > layout->list[i].start) { - overlap_cnt++; - overlaps += ((prev_stop + 1) - layout->list[i].start); - } - prev_stop = layout->list[i].stop; - } + uint32_t overlaps = 0; + uint32_t missing = 0; + uint32_t down = 0; + uint32_t misc = 0; + uint32_t hole_cnt = 0; + uint32_t overlap_cnt = 0; + int i = 0; + int ret = 0; + uint32_t prev_stop = 0; + uint32_t last_stop = 0; + char is_virgin = 1; + uint32_t no_space = 0; + + /* This funtion scans through the layout spread of a directory to + check if there are any anomalies. Prior to calling this function + the layout entries should be sorted in the ascending order. + + If the layout entry has err != 0 + then increment the corresponding anomaly. + else + if (start of the current layout entry > stop + 1 of previous + non erroneous layout entry) + then it indicates a hole in the layout + if (start of the current layout entry < stop + 1 of previous + non erroneous layout entry) + then it indicates an overlap in the layout + */ + last_stop = layout->list[0].start - 1; + prev_stop = last_stop; + + for (i = 0; i < layout->cnt; i++) { + switch (layout->list[i].err) { + case -1: + case ENOENT: + case ESTALE: + missing++; + continue; + case ENOTCONN: + down++; + continue; + case ENOSPC: + no_space++; + continue; + case 0: + /* if err == 0 and start == stop, then it is a non misc++; + * participating subvolume(spread-cnt). Then, do not + * check for anomalies. If start != stop, then treat it + * as misc err */ + if (layout->list[i].start == layout->list[i].stop) { + continue; + } + break; + default: + misc++; + continue; + } + + is_virgin = 0; + + if ((prev_stop + 1) < layout->list[i].start) { + hole_cnt++; + } - if ((last_stop - prev_stop) || is_virgin) - hole_cnt++; - holes += (last_stop - prev_stop); + if ((prev_stop + 1) > layout->list[i].start) { + overlap_cnt++; + overlaps += ((prev_stop + 1) - layout->list[i].start); + } + prev_stop = layout->list[i].stop; + } + + if ((last_stop - prev_stop) || is_virgin) + hole_cnt++; + + if (holes_p) + *holes_p = hole_cnt; - if (holes_p) - *holes_p = hole_cnt; + if (overlaps_p) + *overlaps_p = overlap_cnt; - if (overlaps_p) - *overlaps_p = overlap_cnt; + if (missing_p) + *missing_p = missing; - if (missing_p) - *missing_p = missing; + if (down_p) + *down_p = down; - if (down_p) - *down_p = down; + if (misc_p) + *misc_p = misc; - if (misc_p) - *misc_p = misc; + if (no_space_p) + *no_space_p = no_space; - return ret; + return ret; } int dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout) { - int ret = 0; - int i = 0; - uint32_t holes = 0; - uint32_t overlaps = 0; - uint32_t missing = 0; - uint32_t down = 0; - uint32_t misc = 0; - - - ret = dht_layout_sort (layout); - if (ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "sort failed?! how the ...."); - goto out; - } - - ret = dht_layout_anomalies (this, loc, layout, - &holes, &overlaps, - &missing, &down, &misc); - if (ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "error while finding anomalies in %s -- not good news", - loc->path); - goto out; - } - - if (holes || overlaps) { - if (missing == layout->cnt) { - gf_log (this->name, GF_LOG_DEBUG, - "directory %s looked up first time", - loc->path); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "found anomalies in %s. holes=%d overlaps=%d", - loc->path, holes, overlaps); - } - ret = 1; - } + int ret = 0; + int i = 0; + uint32_t holes = 0; + uint32_t overlaps = 0; + uint32_t missing = 0; + uint32_t down = 0; + uint32_t misc = 0; + + ret = dht_layout_sort (layout); + if (ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "sort failed?! how the ...."); + goto out; + } + + ret = dht_layout_anomalies (this, loc, layout, + &holes, &overlaps, + &missing, &down, &misc, NULL); + if (ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "error while finding anomalies in %s -- not good news", + loc->path); + goto out; + } + + if (holes || overlaps) { + if (missing == layout->cnt) { + gf_log (this->name, GF_LOG_DEBUG, + "directory %s looked up first time", + loc->path); + } else { + gf_log (this->name, GF_LOG_INFO, + "found anomalies in %s. holes=%d overlaps=%d", + loc->path, holes, overlaps); + } + ret = -1; + } + + for (i = 0; i < layout->cnt; i++) { + /* TODO During DHT selfheal rewrite (almost) find a better place + * to detect this - probably in dht_layout_anomalies() + */ + if (layout->list[i].err > 0) { + gf_log_callingfn (this->name, GF_LOG_DEBUG, + "path=%s err=%s on subvol=%s", + loc->path, + strerror (layout->list[i].err), + (layout->list[i].xlator ? + layout->list[i].xlator->name + : "<>")); + if ((layout->list[i].err == ENOENT) && (ret >= 0)) { + ret++; + } + } + } - for (i = 0; i < layout->cnt; i++) { - /* TODO During DHT selfheal rewrite (almost) find a better place to - * detect this - probably in dht_layout_anomalies() - */ - if (layout->list[i].err > 0) { - gf_log (this->name, GF_LOG_DEBUG, - "path=%s err=%s on subvol=%s", - loc->path, strerror (layout->list[i].err), - (layout->list[i].xlator ? - layout->list[i].xlator->name : "<>")); - if (layout->list[i].err == ENOENT) - ret = 1; - } - } out: - return ret; + return ret; } +int +dht_dir_has_layout (dict_t *xattr, char *name) +{ + + void *disk_layout_raw = NULL; + + return dict_get_ptr (xattr, name, &disk_layout_raw); +} int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, - loc_t *loc, dict_t *xattr) + loc_t *loc, dict_t *xattr) { - int idx = 0; - int pos = -1; - int ret = 0; - int err = 0; - int dict_ret = 0; - int32_t *disk_layout = NULL; - int32_t count = -1; - uint32_t start_off = -1; - uint32_t stop_off = -1; - - - for (idx = 0; idx < layout->cnt; idx++) { - if (layout->list[idx].xlator == subvol) { - pos = idx; - break; - } - } - - if (pos == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s - no layout info for subvolume %s", - loc->path, subvol->name); - ret = 1; - goto out; - } + int idx = 0; + int pos = -1; + int ret = 0; + int err = 0; + int dict_ret = 0; + int32_t disk_layout[4]; + void *disk_layout_raw = NULL; + int32_t count = -1; + uint32_t start_off = -1; + uint32_t stop_off = -1; + dht_conf_t *conf = this->private; + + + for (idx = 0; idx < layout->cnt; idx++) { + if (layout->list[idx].xlator == subvol) { + pos = idx; + break; + } + } + + if (pos == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "%s - no layout info for subvolume %s", + loc->path, subvol->name); + ret = 1; + goto out; + } err = layout->list[pos].err; - if (!xattr) { + if (!xattr) { if (err == 0) { - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_INFO, "%s - xattr dictionary is NULL", loc->path); ret = -1; } - goto out; - } + goto out; + } - dict_ret = dict_get_ptr (xattr, "trusted.glusterfs.dht", - VOID(&disk_layout)); - - if (dict_ret < 0) { - if (err == 0) { - gf_log (this->name, GF_LOG_DEBUG, + dict_ret = dict_get_ptr (xattr, conf->xattr_name, + &disk_layout_raw); + + if (dict_ret < 0) { + if (err == 0 && layout->list[pos].stop) { + gf_log (this->name, GF_LOG_INFO, "%s - disk layout missing", loc->path); ret = -1; } - goto out; - } - - count = ntoh32 (disk_layout[0]); - if (count != 1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s - disk layout has invalid count %d", - loc->path, count); - ret = -1; - goto out; - } - - start_off = ntoh32 (disk_layout[2]); - stop_off = ntoh32 (disk_layout[3]); - - if ((layout->list[pos].start != start_off) - || (layout->list[pos].stop != stop_off)) { - gf_log (this->name, GF_LOG_DEBUG, - "subvol: %s; inode layout - %"PRId32" - %"PRId32"; " - "disk layout - %"PRId32" - %"PRId32, - layout->list[pos].xlator->name, - layout->list[pos].start, layout->list[pos].stop, - start_off, stop_off); - ret = 1; - } else { - ret = 0; - } + goto out; + } + + memcpy (disk_layout, disk_layout_raw, sizeof (disk_layout)); + + count = ntoh32 (disk_layout[0]); + if (count != 1) { + gf_log (this->name, GF_LOG_ERROR, + "%s - disk layout has invalid count %d", + loc->path, count); + ret = -1; + goto out; + } + + start_off = ntoh32 (disk_layout[2]); + stop_off = ntoh32 (disk_layout[3]); + + if ((layout->list[pos].start != start_off) + || (layout->list[pos].stop != stop_off)) { + gf_log (this->name, GF_LOG_INFO, + "subvol: %s; inode layout - %"PRIu32" - %"PRIu32"; " + "disk layout - %"PRIu32" - %"PRIu32, + layout->list[pos].xlator->name, + layout->list[pos].start, layout->list[pos].stop, + start_off, stop_off); + ret = 1; + } else { + ret = 0; + } out: - return ret; + return ret; } int -dht_layout_inode_set (xlator_t *this, xlator_t *subvol, inode_t *inode) +dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode) { dht_layout_t *layout = NULL; - int ret = -1; - - layout = dht_layout_for_subvol (this, subvol); - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no pre-set layout for subvolume %s", - subvol ? subvol->name : "<nil>"); - ret = -1; - goto out; - } + int ret = -1; + dht_conf_t *conf = NULL; + + conf = this->private; + if (!conf) + goto out; + + layout = dht_layout_for_subvol (this, subvol); + if (!layout) { + gf_log (this->name, GF_LOG_INFO, + "no pre-set layout for subvolume %s", + subvol ? subvol->name : "<nil>"); + ret = -1; + goto out; + } + + LOCK (&conf->layout_lock); + { + dht_inode_ctx_layout_set (inode, this, layout); + } + UNLOCK (&conf->layout_lock); - inode_ctx_put (inode, this, (uint64_t)(long)layout); - ret = 0; out: return ret; diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c index 213d3f2bf..dbc9d0b3c 100644 --- a/xlators/cluster/dht/src/dht-linkfile.c +++ b/xlators/cluster/dht/src/dht-linkfile.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -28,197 +19,310 @@ #include "compat.h" #include "dht-common.h" - - int -dht_linkfile_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) +dht_linkfile_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) { - dht_local_t *local = NULL; - - - local = frame->local; - local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno, - local->linkfile.inode, - &local->linkfile.stbuf); - - return 0; + char is_linkfile = 0; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + + local = frame->local; + prev = cookie; + conf = this->private; + + if (op_ret) + goto out; + + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); + if (!is_linkfile) + gf_log (this->name, GF_LOG_WARNING, "got non-linkfile %s:%s", + prev->this->name, local->loc.path); +out: + local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno, + inode, stbuf, postparent, postparent, + xattr); + return 0; } - +#define is_equal(a, b) (a == b) int dht_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct stat *stbuf) + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - dict_t *xattr = NULL; - data_t *str_data = NULL; - int ret = -1; - - local = frame->local; - prev = cookie; - - if (op_ret == -1) - goto err; - - xattr = get_new_dict (); - if (!xattr) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - op_errno = ENOMEM; - goto err; - } - - local->linkfile.xattr = dict_ref (xattr); - local->linkfile.inode = inode_ref (inode); - - str_data = str_to_data (local->linkfile.srcvol->name); - if (!str_data) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - op_errno = ENOMEM; - goto err; - } - - ret = dict_set (xattr, "trusted.glusterfs.dht.linkto", str_data); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "failed to initialize linkfile data"); - op_errno = EINVAL; - } - str_data = NULL; - - local->linkfile.stbuf = *stbuf; - - STACK_WIND (frame, dht_linkfile_xattr_cbk, - prev->this, prev->this->fops->setxattr, - &local->linkfile.loc, local->linkfile.xattr, 0); - - return 0; - -err: - if (str_data) { - data_destroy (str_data); - str_data = NULL; - } - - local->linkfile.linkfile_cbk (frame, cookie, this, - op_ret, op_errno, inode, stbuf); - return 0; + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + call_frame_t *prev = NULL; + dict_t *xattrs = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + + local = frame->local; + + if (!op_ret) + local->linked = _gf_true; + + FRAME_SU_UNDO (frame, dht_local_t); + + if (op_ret && (op_errno == EEXIST)) { + conf = this->private; + prev = cookie; + subvol = prev->this; + if (!subvol) + goto out; + xattrs = dict_new (); + if (!xattrs) + goto out; + ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set linkto key"); + goto out; + } + + STACK_WIND (frame, dht_linkfile_lookup_cbk, subvol, + subvol->fops->lookup, &local->loc, xattrs); + if (xattrs) + dict_unref (xattrs); + return 0; + } +out: + local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno, + inode, stbuf, preparent, postparent, + xdata); + if (xattrs) + dict_unref (xattrs); + return 0; } int dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, - xlator_t *tovol, xlator_t *fromvol, loc_t *loc) + xlator_t *this, + xlator_t *tovol, xlator_t *fromvol, loc_t *loc) { - dht_local_t *local = NULL; - - - local = frame->local; - local->linkfile.linkfile_cbk = linkfile_cbk; - local->linkfile.srcvol = tovol; - loc_copy (&local->linkfile.loc, loc); + dht_local_t *local = NULL; + dict_t *dict = NULL; + int need_unref = 0; + int ret = 0; + dht_conf_t *conf = this->private; + + local = frame->local; + local->linkfile.linkfile_cbk = linkfile_cbk; + local->linkfile.srcvol = tovol; + + local->linked = _gf_false; + + dict = local->params; + if (!dict) { + dict = dict_new (); + if (!dict) + goto out; + need_unref = 1; + } + + if (!uuid_is_null (local->gfid)) { + ret = dict_set_static_bin (dict, "gfid-req", local->gfid, 16); + if (ret) + gf_log ("dht-linkfile", GF_LOG_INFO, + "%s: gfid set failed", loc->path); + } + + ret = dict_set_str (dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); + if (ret) + gf_log ("dht-linkfile", GF_LOG_INFO, + "%s: internal-fop set failed", loc->path); + + ret = dict_set_str (dict, conf->link_xattr_name, tovol->name); + + if (ret < 0) { + gf_log (frame->this->name, GF_LOG_INFO, + "%s: failed to initialize linkfile data", + loc->path); + goto out; + } + + local->link_subvol = fromvol; + /* Always create as root:root. dht_linkfile_attr_heal fixes the + * ownsership */ + FRAME_SU_DO (frame, dht_local_t); + STACK_WIND (frame, dht_linkfile_create_cbk, + fromvol, fromvol->fops->mknod, loc, + S_IFREG | DHT_LINKFILE_MODE, 0, 0, dict); + + if (need_unref && dict) + dict_unref (dict); + + return 0; +out: + local->linkfile.linkfile_cbk (frame, NULL, frame->this, -1, ENOMEM, + loc->inode, NULL, NULL, NULL, NULL); - STACK_WIND (frame, dht_linkfile_create_cbk, - fromvol, fromvol->fops->mknod, loc, - S_IFREG | DHT_LINKFILE_MODE, 0); + if (need_unref && dict) + dict_unref (dict); - return 0; + return 0; } int dht_linkfile_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - xlator_t *subvol = NULL; + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + xlator_t *subvol = NULL; - local = frame->local; - prev = cookie; - subvol = prev->this; + local = frame->local; + prev = cookie; + subvol = prev->this; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "unlinking linkfile %s on %s failed (%s)", - local->loc.path, subvol->name, strerror (op_errno)); - } + if (op_ret == -1) { + gf_log (this->name, GF_LOG_INFO, + "unlinking linkfile %s on %s failed (%s)", + local->loc.path, subvol->name, strerror (op_errno)); + } - DHT_STACK_DESTROY (frame); + DHT_STACK_DESTROY (frame); - return 0; + return 0; } int dht_linkfile_unlink (call_frame_t *frame, xlator_t *this, - xlator_t *subvol, loc_t *loc) + xlator_t *subvol, loc_t *loc) { - call_frame_t *unlink_frame = NULL; - dht_local_t *unlink_local = NULL; - - unlink_frame = copy_frame (frame); - if (!unlink_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } - - unlink_local = dht_local_init (unlink_frame); - if (!unlink_local) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } - - loc_copy (&unlink_local->loc, loc); - - STACK_WIND (unlink_frame, dht_linkfile_unlink_cbk, - subvol, subvol->fops->unlink, - &unlink_local->loc); - - return 0; + call_frame_t *unlink_frame = NULL; + dht_local_t *unlink_local = NULL; + + unlink_frame = copy_frame (frame); + if (!unlink_frame) { + goto err; + } + + /* Using non-fop value here, as anyways, 'local->fop' is not used in + this particular case */ + unlink_local = dht_local_init (unlink_frame, loc, NULL, + GF_FOP_MAXVALUE); + if (!unlink_local) { + goto err; + } + + STACK_WIND (unlink_frame, dht_linkfile_unlink_cbk, + subvol, subvol->fops->unlink, + &unlink_local->loc, 0, NULL); + + return 0; err: - if (unlink_frame) - DHT_STACK_DESTROY (unlink_frame); + if (unlink_frame) + DHT_STACK_DESTROY (unlink_frame); - return -1; + return -1; } xlator_t * -dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct stat *stbuf, - dict_t *xattr) +dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct iatt *stbuf, + dict_t *xattr) { - dht_conf_t *conf = NULL; - xlator_t *subvol = NULL; - void *volname = NULL; - int i = 0, ret = 0; - + dht_conf_t *conf = NULL; + xlator_t *subvol = NULL; + void *volname = NULL; + int i = 0, ret = 0; - conf = this->private; + conf = this->private; - if (!xattr) - goto out; + if (!xattr) + goto out; - ret = dict_get_ptr (xattr, "trusted.glusterfs.dht.linkto", &volname); + ret = dict_get_ptr (xattr, conf->link_xattr_name, &volname); - if ((-1 == ret) || !volname) - goto out; + if ((-1 == ret) || !volname) + goto out; - for (i = 0; i < conf->subvolume_cnt; i++) { - if (strcmp (conf->subvolumes[i]->name, (char *)volname) == 0) { - subvol = conf->subvolumes[i]; - break; - } - } + for (i = 0; i < conf->subvolume_cnt; i++) { + if (strcmp (conf->subvolumes[i]->name, (char *)volname) == 0) { + subvol = conf->subvolumes[i]; + break; + } + } out: - return subvol; + return subvol; +} + +int +dht_linkfile_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) +{ + dht_local_t *local = NULL; + loc_t *loc = NULL; + + local = frame->local; + loc = &local->loc; + + if (op_ret) + gf_log (this->name, GF_LOG_ERROR, "setattr of uid/gid on %s" + " :<gfid:%s> failed (%s)", + (loc->path? loc->path: "NULL"), + uuid_utoa(local->gfid), strerror(op_errno)); + + DHT_STACK_DESTROY (frame); + + return 0; } +int +dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this) +{ + int ret = -1; + call_frame_t *copy = NULL; + dht_local_t *local = NULL; + dht_local_t *copy_local = NULL; + xlator_t *subvol = NULL; + struct iatt stbuf = {0,}; + + local = frame->local; + + GF_VALIDATE_OR_GOTO ("dht", local, out); + GF_VALIDATE_OR_GOTO ("dht", local->link_subvol, out); + + if (local->stbuf.ia_type == IA_INVAL) + return 0; + + uuid_copy (local->loc.gfid, local->stbuf.ia_gfid); + copy = copy_frame (frame); + + if (!copy) + goto out; + + copy_local = dht_local_init (copy, &local->loc, NULL, 0); + + if (!copy_local) + goto out; + + stbuf = local->stbuf; + subvol = local->link_subvol; + + copy->local = copy_local; + + FRAME_SU_DO (copy, dht_local_t); + + STACK_WIND (copy, dht_linkfile_setattr_cbk, subvol, + subvol->fops->setattr, ©_local->loc, + &stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID), NULL); + ret = 0; +out: + return ret; +} diff --git a/xlators/cluster/dht/src/dht-mem-types.h b/xlators/cluster/dht/src/dht-mem-types.h new file mode 100644 index 000000000..e893eb48f --- /dev/null +++ b/xlators/cluster/dht/src/dht-mem-types.h @@ -0,0 +1,35 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef __DHT_MEM_TYPES_H__ +#define __DHT_MEM_TYPES_H__ + +#include "mem-types.h" + +enum gf_dht_mem_types_ { + gf_dht_mt_dht_du_t = gf_common_mt_end + 1, + gf_dht_mt_dht_conf_t, + gf_dht_mt_char, + gf_dht_mt_int32_t, + gf_dht_mt_xlator_t, + gf_dht_mt_dht_layout_t, + gf_switch_mt_dht_conf_t, + gf_switch_mt_dht_du_t, + gf_switch_mt_switch_sched_array, + gf_switch_mt_switch_struct, + gf_dht_mt_subvol_time, + gf_dht_mt_loc_t, + gf_defrag_info_mt, + gf_dht_mt_inode_ctx_t, + gf_dht_mt_ctx_stat_time_t, + gf_dht_mt_end +}; +#endif diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c new file mode 100644 index 000000000..4f78f5203 --- /dev/null +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -0,0 +1,1908 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "dht-common.h" +#include "xlator.h" +#include <signal.h> +#include <fnmatch.h> + +#define GF_DISK_SECTOR_SIZE 512 +#define DHT_REBALANCE_PID 4242 /* Change it if required */ +#define DHT_REBALANCE_BLKSIZE (128 * 1024) + +static int +dht_write_with_holes (xlator_t *to, fd_t *fd, struct iovec *vec, int count, + int32_t size, off_t offset, struct iobref *iobref) +{ + int i = 0; + int ret = -1; + int start_idx = 0; + int tmp_offset = 0; + int write_needed = 0; + int buf_len = 0; + int size_pending = 0; + char *buf = NULL; + + /* loop through each vector */ + for (i = 0; i < count; i++) { + buf = vec[i].iov_base; + buf_len = vec[i].iov_len; + + for (start_idx = 0; (start_idx + GF_DISK_SECTOR_SIZE) <= buf_len; + start_idx += GF_DISK_SECTOR_SIZE) { + + if (mem_0filled (buf + start_idx, GF_DISK_SECTOR_SIZE) != 0) { + write_needed = 1; + continue; + } + + if (write_needed) { + ret = syncop_write (to, fd, (buf + tmp_offset), + (start_idx - tmp_offset), + (offset + tmp_offset), + iobref, 0); + /* 'path' will be logged in calling function */ + if (ret < 0) { + gf_log (THIS->name, GF_LOG_WARNING, + "failed to write (%s)", + strerror (-ret)); + ret = -1; + goto out; + } + + write_needed = 0; + } + tmp_offset = start_idx + GF_DISK_SECTOR_SIZE; + } + + if ((start_idx < buf_len) || write_needed) { + /* This means, last chunk is not yet written.. write it */ + ret = syncop_write (to, fd, (buf + tmp_offset), + (buf_len - tmp_offset), + (offset + tmp_offset), iobref, 0); + if (ret < 0) { + /* 'path' will be logged in calling function */ + gf_log (THIS->name, GF_LOG_WARNING, + "failed to write (%s)", + strerror (-ret)); + ret = -1; + goto out; + } + } + + size_pending = (size - buf_len); + if (!size_pending) + break; + } + + ret = size; +out: + return ret; + +} + +/* + return values: + -1 : failure + -2 : success + +Hard link migration is carried out in three stages. + +(Say there are n hardlinks) +Stage 1: Setting the new hashed subvol information on the 1st hardlink + encountered (linkto setxattr) + +Stage 2: Creating hardlinks on new hashed subvol for the 2nd to (n-1)th + hardlink + +Stage 3: Physical migration of the data file for nth hardlink + +Why to deem "-2" as success and not "0": + + dht_migrate_file expects return value "0" from _is_file_migratable if +the file has to be migrated. + + _is_file_migratable returns zero only when it is called with the +flag "GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS". + + gf_defrag_handle_hardlink calls dht_migrate_file for physical migration +of the data file with the flag "GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS" + +Hence, gf_defrag_handle_hardlink returning "0" for success will force +"dht_migrate_file" to migrate each of the hardlink which is not intended. + +For each of the three stage mentioned above "-2" will be returned and will +be converted to "0" in dht_migrate_file. + +*/ + +int32_t +gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs, + struct iatt *stbuf) +{ + int32_t ret = -1; + xlator_t *cached_subvol = NULL; + xlator_t *hashed_subvol = NULL; + xlator_t *linkto_subvol = NULL; + data_t *data = NULL; + struct iatt iatt = {0,}; + int32_t op_errno = 0; + dht_conf_t *conf = NULL; + + GF_VALIDATE_OR_GOTO ("defrag", loc, out); + GF_VALIDATE_OR_GOTO ("defrag", loc->name, out); + GF_VALIDATE_OR_GOTO ("defrag", stbuf, out); + GF_VALIDATE_OR_GOTO ("defrag", this, out); + GF_VALIDATE_OR_GOTO ("defrag", xattrs, out); + GF_VALIDATE_OR_GOTO ("defrag", this->private, out); + + conf = this->private; + + if (uuid_is_null (loc->pargfid)) { + gf_log ("", GF_LOG_ERROR, "loc->pargfid is NULL for " + "%s", loc->path); + goto out; + } + + if (uuid_is_null (loc->gfid)) { + gf_log ("", GF_LOG_ERROR, "loc->gfid is NULL for " + "%s", loc->path); + goto out; + } + + cached_subvol = dht_subvol_get_cached (this, loc->inode); + if (!cached_subvol) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get cached subvol" + " for %s on %s", loc->name, this->name); + goto out; + } + + hashed_subvol = dht_subvol_get_hashed (this, loc); + if (!hashed_subvol) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get hashed subvol" + " for %s on %s", loc->name, this->name); + goto out; + } + + gf_log (this->name, GF_LOG_INFO, "Attempting to migrate hardlink %s " + "with gfid %s from %s -> %s", loc->name, uuid_utoa (loc->gfid), + cached_subvol->name, hashed_subvol->name); + data = dict_get (xattrs, conf->link_xattr_name); + /* set linkto on cached -> hashed if not present, else link it */ + if (!data) { + ret = dict_set_str (xattrs, conf->link_xattr_name, + hashed_subvol->name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set " + "linkto xattr in dict for %s", loc->name); + goto out; + } + + ret = syncop_setxattr (cached_subvol, loc, xattrs, 0); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Linkto setxattr " + "failed %s -> %s (%s)", cached_subvol->name, + loc->name, strerror (-ret)); + ret = -1; + goto out; + } + ret = -2; + goto out; + } else { + linkto_subvol = dht_linkfile_subvol (this, NULL, NULL, xattrs); + if (!linkto_subvol) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get " + "linkto subvol for %s", loc->name); + } else { + hashed_subvol = linkto_subvol; + } + + ret = syncop_link (hashed_subvol, loc, loc); + if (ret) { + op_errno = -ret; + ret = -1; + gf_log (this->name, GF_LOG_ERROR, "link of %s -> %s" + " failed on subvol %s (%s)", loc->name, + uuid_utoa(loc->gfid), + hashed_subvol->name, strerror (op_errno)); + if (op_errno != EEXIST) + goto out; + } + } + ret = syncop_lookup (hashed_subvol, loc, NULL, &iatt, NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed lookup %s on %s (%s)" + , loc->name, hashed_subvol->name, strerror (-ret)); + ret = -1; + goto out; + } + + if (iatt.ia_nlink == stbuf->ia_nlink) { + ret = dht_migrate_file (this, loc, cached_subvol, hashed_subvol, + GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS); + if (ret) + goto out; + } + ret = -2; +out: + return ret; +} + +/* + return values + 0 : File will be migrated + -2 : File will not be migrated + (This is the return value from gf_defrag_handle_hardlink. Checkout + gf_defrag_handle_hardlink for description of "returning -2") + -1 : failure +*/ +static inline int +__is_file_migratable (xlator_t *this, loc_t *loc, + struct iatt *stbuf, dict_t *xattrs, int flags) +{ + int ret = -1; + + if (IA_ISDIR (stbuf->ia_type)) { + gf_log (this->name, GF_LOG_WARNING, + "%s: migrate-file called on directory", loc->path); + ret = -1; + goto out; + } + + if (flags == GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS) { + ret = 0; + goto out; + } + if (stbuf->ia_nlink > 1) { + /* support for decomission */ + if (flags == GF_DHT_MIGRATE_HARDLINK) { + ret = gf_defrag_handle_hardlink (this, loc, + xattrs, stbuf); + + /* + Returning zero will force the file to be remigrated. + Checkout gf_defrag_handle_hardlink for more information. + */ + if (ret && ret != -2) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to migrate file with link", + loc->path); + } + } else { + gf_log (this->name, GF_LOG_WARNING, + "%s: file has hardlinks", loc->path); + ret = -ENOTSUP; + } + goto out; + } + + ret = 0; + +out: + return ret; +} + +static inline int +__dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struct iatt *stbuf, + dict_t *dict, fd_t **dst_fd, dict_t *xattr) +{ + xlator_t *this = NULL; + int ret = -1; + fd_t *fd = NULL; + struct iatt new_stbuf = {0,}; + dht_conf_t *conf = NULL; + + this = THIS; + conf = this->private; + + ret = dict_set_static_bin (dict, "gfid-req", stbuf->ia_gfid, 16); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set gfid in dict for create", loc->path); + goto out; + } + + ret = dict_set_str (dict, conf->link_xattr_name, from->name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set gfid in dict for create", loc->path); + goto out; + } + + fd = fd_create (loc->inode, DHT_REBALANCE_PID); + if (!fd) { + gf_log (this->name, GF_LOG_ERROR, + "%s: fd create failed (destination) (%s)", + loc->path, strerror (errno)); + ret = -1; + goto out; + } + + ret = syncop_lookup (to, loc, NULL, &new_stbuf, NULL, NULL); + if (!ret) { + /* File exits in the destination, check if gfid matches */ + if (uuid_compare (stbuf->ia_gfid, new_stbuf.ia_gfid) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "file %s exits in %s with different gfid", + loc->path, to->name); + fd_unref (fd); + goto out; + } + } + if ((ret < 0) && (-ret != ENOENT)) { + /* File exists in destination, but not accessible */ + gf_log (THIS->name, GF_LOG_WARNING, + "%s: failed to lookup file (%s)", + loc->path, strerror (-ret)); + ret = -1; + goto out; + } + + /* Create the destination with LINKFILE mode, and linkto xattr, + if the linkfile already exists, it will just open the file */ + ret = syncop_create (to, loc, O_RDWR, DHT_LINKFILE_MODE, fd, + dict, &new_stbuf); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to create %s on %s (%s)", + loc->path, to->name, strerror (-ret)); + ret = -1; + goto out; + } + + ret = syncop_fsetxattr (to, fd, xattr, 0); + if (ret < 0) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set xattr on %s (%s)", + loc->path, to->name, strerror (-ret)); + + ret = syncop_ftruncate (to, fd, stbuf->ia_size); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, + "ftruncate failed for %s on %s (%s)", + loc->path, to->name, strerror (-ret)); + + ret = syncop_fsetattr (to, fd, stbuf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID), + NULL, NULL); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, + "chown failed for %s on %s (%s)", + loc->path, to->name, strerror (-ret)); + + if (dst_fd) + *dst_fd = fd; + + /* success */ + ret = 0; + +out: + return ret; +} + +static inline int +__dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc, + struct iatt *stbuf, int flag) +{ + struct statvfs src_statfs = {0,}; + struct statvfs dst_statfs = {0,}; + int ret = -1; + xlator_t *this = NULL; + + uint64_t src_statfs_blocks = 1; + uint64_t dst_statfs_blocks = 1; + + this = THIS; + + ret = syncop_statfs (from, loc, &src_statfs); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "failed to get statfs of %s on %s (%s)", + loc->path, from->name, strerror (-ret)); + ret = -1; + goto out; + } + + ret = syncop_statfs (to, loc, &dst_statfs); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "failed to get statfs of %s on %s (%s)", + loc->path, to->name, strerror (-ret)); + ret = -1; + goto out; + } + + /* if force option is given, do not check for space @ dst. + * Check only if space is avail for the file */ + if (flag != GF_DHT_MIGRATE_DATA) + goto check_avail_space; + + /* Check: + During rebalance `migrate-data` - Destination subvol experiences + a `reduction` in 'blocks' of free space, at the same time source + subvol gains certain 'blocks' of free space. A valid check is + necessary here to avoid errorneous move to destination where + the space could be scantily available. + */ + if (stbuf) { + dst_statfs_blocks = ((dst_statfs.f_bavail * + dst_statfs.f_bsize) / + GF_DISK_SECTOR_SIZE); + src_statfs_blocks = ((src_statfs.f_bavail * + src_statfs.f_bsize) / + GF_DISK_SECTOR_SIZE); + if ((dst_statfs_blocks - stbuf->ia_blocks) < + (src_statfs_blocks + stbuf->ia_blocks)) { + gf_log (this->name, GF_LOG_WARNING, + "data movement attempted from node (%s) with" + " higher disk space to a node (%s) with " + "lesser disk space (%s)", from->name, + to->name, loc->path); + + /* this is not a 'failure', but we don't want to + consider this as 'success' too :-/ */ + ret = 1; + goto out; + } + } +check_avail_space: + if (((dst_statfs.f_bavail * dst_statfs.f_bsize) / + GF_DISK_SECTOR_SIZE) < stbuf->ia_blocks) { + gf_log (this->name, GF_LOG_ERROR, + "data movement attempted from node (%s) with " + "to node (%s) which does not have required free space" + " for %s", from->name, to->name, loc->path); + ret = 1; + goto out; + } + + ret = 0; +out: + return ret; +} + +static inline int +__dht_rebalance_migrate_data (xlator_t *from, xlator_t *to, fd_t *src, fd_t *dst, + uint64_t ia_size, int hole_exists) +{ + int ret = 0; + int count = 0; + off_t offset = 0; + struct iovec *vector = NULL; + struct iobref *iobref = NULL; + uint64_t total = 0; + size_t read_size = 0; + + /* if file size is '0', no need to enter this loop */ + while (total < ia_size) { + read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE) ? + DHT_REBALANCE_BLKSIZE : (ia_size - total)); + ret = syncop_readv (from, src, read_size, + offset, 0, &vector, &count, &iobref); + if (!ret || (ret < 0)) { + break; + } + + if (hole_exists) + ret = dht_write_with_holes (to, dst, vector, count, + ret, offset, iobref); + else + ret = syncop_writev (to, dst, vector, count, + offset, iobref, 0); + if (ret < 0) { + break; + } + offset += ret; + total += ret; + + GF_FREE (vector); + if (iobref) + iobref_unref (iobref); + iobref = NULL; + vector = NULL; + } + if (iobref) + iobref_unref (iobref); + GF_FREE (vector); + + if (ret >= 0) + ret = 0; + else + ret = -1; + + return ret; +} + + +static inline int +__dht_rebalance_open_src_file (xlator_t *from, xlator_t *to, loc_t *loc, + struct iatt *stbuf, fd_t **src_fd) +{ + int ret = 0; + fd_t *fd = NULL; + dict_t *dict = NULL; + xlator_t *this = NULL; + struct iatt iatt = {0,}; + dht_conf_t *conf = NULL; + + this = THIS; + conf = this->private; + + fd = fd_create (loc->inode, DHT_REBALANCE_PID); + if (!fd) { + gf_log (this->name, GF_LOG_ERROR, + "%s: fd create failed (source)", loc->path); + ret = -1; + goto out; + } + + ret = syncop_open (from, loc, O_RDWR, fd); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to open file %s on %s (%s)", + loc->path, from->name, strerror (-ret)); + ret = -1; + goto out; + } + + ret = -1; + dict = dict_new (); + if (!dict) + goto out; + + ret = dict_set_str (dict, conf->link_xattr_name, to->name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set xattr in dict for %s (linkto:%s)", + loc->path, to->name); + goto out; + } + + /* Once the migration starts, the source should have 'linkto' key set + to show which is the target, so other clients can work around it */ + ret = syncop_setxattr (from, loc, dict, 0); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set xattr on %s in %s (%s)", + loc->path, from->name, strerror (-ret)); + ret = -1; + goto out; + } + + /* mode should be (+S+T) to indicate migration is in progress */ + iatt.ia_prot = stbuf->ia_prot; + iatt.ia_type = stbuf->ia_type; + iatt.ia_prot.sticky = 1; + iatt.ia_prot.sgid = 1; + + ret = syncop_setattr (from, loc, &iatt, GF_SET_ATTR_MODE, NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set mode on %s in %s (%s)", + loc->path, from->name, strerror (-ret)); + ret = -1; + goto out; + } + + if (src_fd) + *src_fd = fd; + + /* success */ + ret = 0; +out: + if (dict) + dict_unref (dict); + + return ret; +} + +int +migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc, + struct iatt *buf) +{ + int ret = -1; + dict_t *rsp_dict = NULL; + dict_t *dict = NULL; + char *link = NULL; + struct iatt stbuf = {0,}; + dht_conf_t *conf = this->private; + + dict = dict_new (); + if (!dict) + goto out; + + ret = dict_set_int32 (dict, conf->link_xattr_name, 256); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set 'linkto' key in dict", loc->path); + goto out; + } + + /* check in the destination if the file is link file */ + ret = syncop_lookup (to, loc, dict, &stbuf, &rsp_dict, NULL); + if ((ret < 0) && (-ret != ENOENT)) { + gf_log (this->name, GF_LOG_WARNING, "%s: lookup failed (%s)", + loc->path, strerror (-ret)); + ret = -1; + goto out; + } + + /* we no more require this key */ + dict_del (dict, conf->link_xattr_name); + + /* file exists in target node, only if it is 'linkfile' its valid, + otherwise, error out */ + if (!ret) { + if (!check_is_linkfile (loc->inode, &stbuf, rsp_dict, + conf->link_xattr_name)) { + gf_log (this->name, GF_LOG_WARNING, + "%s: file exists in destination", loc->path); + ret = -1; + goto out; + } + + /* as file is linkfile, delete it */ + ret = syncop_unlink (to, loc); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to delete the linkfile (%s)", + loc->path, strerror (-ret)); + ret = -1; + goto out; + } + } + + /* Set the gfid of the source file in dict */ + ret = dict_set_static_bin (dict, "gfid-req", buf->ia_gfid, 16); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set gfid in dict for create", loc->path); + goto out; + } + + /* Create the file in target */ + if (IA_ISLNK (buf->ia_type)) { + /* Handle symlinks separately */ + ret = syncop_readlink (from, loc, &link, buf->ia_size); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: readlink on symlink failed (%s)", + loc->path, strerror (-ret)); + ret = -1; + goto out; + } + + ret = syncop_symlink (to, loc, link, dict, 0); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: creating symlink failed (%s)", + loc->path, strerror (-ret)); + ret = -1; + goto out; + } + + goto done; + } + + ret = syncop_mknod (to, loc, st_mode_from_ia (buf->ia_prot, + buf->ia_type), + makedev (ia_major (buf->ia_rdev), + ia_minor (buf->ia_rdev)), dict, 0); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "%s: mknod failed (%s)", + loc->path, strerror (-ret)); + ret = -1; + goto out; + } + +done: + ret = syncop_setattr (to, loc, buf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID | + GF_SET_ATTR_MODE), NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform setattr on %s (%s)", + loc->path, to->name, strerror (-ret)); + ret = -1; + } + + ret = syncop_unlink (from, loc); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "%s: unlink failed (%s)", + loc->path, strerror (-ret)); + ret = -1; + } + +out: + if (dict) + dict_unref (dict); + + if (rsp_dict) + dict_unref (rsp_dict); + + return ret; +} + +/* + return values: + + -1 : failure + 0 : successfully migrated data + 1 : not a failure, but we can't migrate data as of now +*/ +int +dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, + int flag) +{ + int ret = -1; + struct iatt new_stbuf = {0,}; + struct iatt stbuf = {0,}; + struct iatt empty_iatt = {0,}; + ia_prot_t src_ia_prot = {0,}; + fd_t *src_fd = NULL; + fd_t *dst_fd = NULL; + dict_t *dict = NULL; + dict_t *xattr = NULL; + dict_t *xattr_rsp = NULL; + int file_has_holes = 0; + dht_conf_t *conf = this->private; + + gf_log (this->name, GF_LOG_INFO, "%s: attempting to move from %s to %s", + loc->path, from->name, to->name); + + dict = dict_new (); + if (!dict) + goto out; + + ret = dict_set_int32 (dict, conf->link_xattr_name, 256); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set 'linkto' key in dict", loc->path); + goto out; + } + + /* Phase 1 - Data migration is in progress from now on */ + ret = syncop_lookup (from, loc, dict, &stbuf, &xattr_rsp, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s: lookup failed on %s (%s)", + loc->path, from->name, strerror (-ret)); + ret = -1; + goto out; + } + + /* we no more require this key */ + dict_del (dict, conf->link_xattr_name); + + /* preserve source mode, so set the same to the destination */ + src_ia_prot = stbuf.ia_prot; + + /* Check if file can be migrated */ + ret = __is_file_migratable (this, loc, &stbuf, xattr_rsp, flag); + if (ret) { + if (ret == -2) + ret = 0; + goto out; + } + /* Take care of the special files */ + if (!IA_ISREG (stbuf.ia_type)) { + /* Special files */ + ret = migrate_special_files (this, from, to, loc, &stbuf); + goto out; + } + + /* TODO: move all xattr related operations to fd based operations */ + ret = syncop_listxattr (from, loc, &xattr); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to get xattr from %s (%s)", + loc->path, from->name, strerror (-ret)); + ret = -1; + } + + /* create the destination, with required modes/xattr */ + ret = __dht_rebalance_create_dst_file (to, from, loc, &stbuf, + dict, &dst_fd, xattr); + if (ret) + goto out; + + ret = __dht_check_free_space (to, from, loc, &stbuf, flag); + if (ret) { + goto out; + } + + /* Open the source, and also update mode/xattr */ + ret = __dht_rebalance_open_src_file (from, to, loc, &stbuf, &src_fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to open %s on %s", + loc->path, from->name); + goto out; + } + + + ret = syncop_fstat (from, src_fd, &stbuf); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to lookup %s on %s (%s)", + loc->path, from->name, strerror (-ret)); + ret = -1; + goto out; + } + + /* Try to preserve 'holes' while migrating data */ + if (stbuf.ia_size > (stbuf.ia_blocks * GF_DISK_SECTOR_SIZE)) + file_has_holes = 1; + + /* All I/O happens in this function */ + ret = __dht_rebalance_migrate_data (from, to, src_fd, dst_fd, + stbuf.ia_size, file_has_holes); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s: failed to migrate data", + loc->path); + /* reset the destination back to 0 */ + ret = syncop_ftruncate (to, dst_fd, 0); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to reset target size back to 0 (%s)", + loc->path, strerror (-ret)); + } + + ret = -1; + goto out; + } + + /* TODO: Sync the locks */ + + ret = syncop_fsync (to, dst_fd, 0); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to fsync on %s (%s)", + loc->path, to->name, strerror (-ret)); + ret = -1; + } + + + /* Phase 2 - Data-Migration Complete, Housekeeping updates pending */ + + ret = syncop_fstat (from, src_fd, &new_stbuf); + if (ret < 0) { + /* Failed to get the stat info */ + gf_log (this->name, GF_LOG_ERROR, + "failed to fstat file %s on %s (%s)", + loc->path, from->name, strerror (-ret)); + ret = -1; + goto out; + } + + /* source would have both sticky bit and sgid bit set, reset it to 0, + and set the source permission on destination, if it was not set + prior to setting rebalance-modes in source */ + if (!src_ia_prot.sticky) + new_stbuf.ia_prot.sticky = 0; + + if (!src_ia_prot.sgid) + new_stbuf.ia_prot.sgid = 0; + + /* TODO: if the source actually had sticky bit, or sgid bit set, + we are not handling it */ + + ret = syncop_fsetattr (to, dst_fd, &new_stbuf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID | + GF_SET_ATTR_MODE), NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform setattr on %s (%s)", + loc->path, to->name, strerror (-ret)); + ret = -1; + goto out; + } + + /* Because 'futimes' is not portable */ + ret = syncop_setattr (to, loc, &new_stbuf, + (GF_SET_ATTR_MTIME | GF_SET_ATTR_ATIME), + NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform setattr on %s (%s)", + loc->path, to->name, strerror (-ret)); + ret = -1; + } + + /* Make the source as a linkfile first before deleting it */ + empty_iatt.ia_prot.sticky = 1; + ret = syncop_fsetattr (from, src_fd, &empty_iatt, + GF_SET_ATTR_MODE, NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, \ + "%s: failed to perform setattr on %s (%s)", + loc->path, from->name, strerror (-ret)); + ret = -1; + goto out; + } + + /* Free up the data blocks on the source node, as the whole + file is migrated */ + ret = syncop_ftruncate (from, src_fd, 0); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform truncate on %s (%s)", + loc->path, from->name, strerror (-ret)); + ret = -1; + } + + /* remove the 'linkto' xattr from the destination */ + ret = syncop_fremovexattr (to, dst_fd, conf->link_xattr_name, 0); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform removexattr on %s (%s)", + loc->path, to->name, strerror (-ret)); + ret = -1; + } + + /* Do a stat and check the gfid before unlink */ + ret = syncop_stat (from, loc, &empty_iatt); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to do a stat on %s (%s)", + loc->path, from->name, strerror (-ret)); + ret = -1; + goto out; + } + + if (uuid_compare (empty_iatt.ia_gfid, loc->gfid) == 0) { + /* take out the source from namespace */ + ret = syncop_unlink (from, loc); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform unlink on %s (%s)", + loc->path, from->name, strerror (-ret)); + ret = -1; + goto out; + } + } + + ret = syncop_lookup (this, loc, NULL, NULL, NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "%s: failed to lookup the file on subvolumes (%s)", + loc->path, strerror (-ret)); + ret = -1; + } + + gf_log (this->name, GF_LOG_INFO, + "completed migration of %s from subvolume %s to %s", + loc->path, from->name, to->name); + + ret = 0; +out: + if (dict) + dict_unref (dict); + + if (xattr) + dict_unref (xattr); + if (xattr_rsp) + dict_unref (xattr_rsp); + + if (dst_fd) + syncop_close (dst_fd); + if (src_fd) + syncop_close (src_fd); + + return ret; +} + +static int +rebalance_task (void *data) +{ + int ret = -1; + dht_local_t *local = NULL; + call_frame_t *frame = NULL; + + frame = data; + + local = frame->local; + + /* This function is 'synchrounous', hence if it returns, + we are done with the task */ + ret = dht_migrate_file (THIS, &local->loc, local->rebalance.from_subvol, + local->rebalance.target_node, local->flags); + + return ret; +} + +static int +rebalance_task_completion (int op_ret, call_frame_t *sync_frame, void *data) +{ + int ret = -1; + uint64_t layout_int = 0; + dht_layout_t *layout = 0; + xlator_t *this = NULL; + dht_local_t *local = NULL; + int32_t op_errno = EINVAL; + + this = THIS; + local = sync_frame->local; + + if (!op_ret) { + /* Make sure we have valid 'layout' in inode ctx + after the operation */ + ret = inode_ctx_del (local->loc.inode, this, &layout_int); + if (!ret && layout_int) { + layout = (dht_layout_t *)(long)layout_int; + dht_layout_unref (this, layout); + } + + ret = dht_layout_preset (this, local->rebalance.target_node, + local->loc.inode); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set inode ctx", local->loc.path); + } + + if (op_ret == -1) { + /* Failure of migration process, mostly due to write process. + as we can't preserve the exact errno, lets say there was + no space to migrate-data + */ + op_errno = ENOSPC; + } + + if (op_ret == 1) { + /* migration didn't happen, but is not a failure, let the user + understand that he doesn't have permission to migrate the + file. + */ + op_ret = -1; + op_errno = EPERM; + } + + DHT_STACK_UNWIND (setxattr, sync_frame, op_ret, op_errno, NULL); + return 0; +} + +int +dht_start_rebalance_task (xlator_t *this, call_frame_t *frame) +{ + int ret = -1; + + ret = synctask_new (this->ctx->env, rebalance_task, + rebalance_task_completion, + frame, frame); + return ret; +} + +int +gf_listener_stop (xlator_t *this) +{ + glusterfs_ctx_t *ctx = NULL; + cmd_args_t *cmd_args = NULL; + int ret = 0; + + ctx = this->ctx; + GF_ASSERT (ctx); + cmd_args = &ctx->cmd_args; + if (cmd_args->sock_file) { + ret = unlink (cmd_args->sock_file); + if (ret && (ENOENT == errno)) { + ret = 0; + } + } + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to unlink listener " + "socket %s, error: %s", cmd_args->sock_file, + strerror (errno)); + } + return ret; +} + +void +dht_build_root_inode (xlator_t *this, inode_t **inode) +{ + inode_table_t *itable = NULL; + uuid_t root_gfid = {0, }; + + itable = inode_table_new (0, this); + if (!itable) + return; + + root_gfid[15] = 1; + *inode = inode_find (itable, root_gfid); +} + +void +dht_build_root_loc (inode_t *inode, loc_t *loc) +{ + loc->path = "/"; + loc->inode = inode; + loc->inode->ia_type = IA_IFDIR; + memset (loc->gfid, 0, 16); + loc->gfid[15] = 1; +} + + +/* return values: 1 -> error, bug ignore and continue + 0 -> proceed + -1 -> error, handle it */ +int32_t +gf_defrag_handle_migrate_error (int32_t op_errno, gf_defrag_info_t *defrag) +{ + /* if errno is not ENOSPC or ENOTCONN, we can still continue + with rebalance process */ + if ((op_errno != ENOSPC) || (op_errno != ENOTCONN)) + return 1; + + if (op_errno == ENOTCONN) { + /* Most probably mount point went missing (mostly due + to a brick down), say rebalance failure to user, + let him restart it if everything is fine */ + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + return -1; + } + + if (op_errno == ENOSPC) { + /* rebalance process itself failed, may be + remote brick went down, or write failed due to + disk full etc etc.. */ + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + return -1; + } + + return 0; +} + +static gf_boolean_t +gf_defrag_pattern_match (gf_defrag_info_t *defrag, char *name, uint64_t size) +{ + gf_defrag_pattern_list_t *trav = NULL; + gf_boolean_t match = _gf_false; + gf_boolean_t ret = _gf_false; + + GF_VALIDATE_OR_GOTO ("dht", defrag, out); + + trav = defrag->defrag_pattern; + while (trav) { + if (!fnmatch (trav->path_pattern, name, FNM_NOESCAPE)) { + match = _gf_true; + break; + } + trav = trav->next; + } + + if ((match == _gf_true) && (size >= trav->size)) + ret = _gf_true; + + out: + return ret; +} + +/* We do a depth first traversal of directories. But before we move into + * subdirs, we complete the data migration of those directories whose layouts + * have been fixed + */ + +int +gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, + dict_t *migrate_data) +{ + int ret = -1; + loc_t entry_loc = {0,}; + fd_t *fd = NULL; + gf_dirent_t entries; + gf_dirent_t *tmp = NULL; + gf_dirent_t *entry = NULL; + gf_boolean_t free_entries = _gf_false; + off_t offset = 0; + dict_t *dict = NULL; + struct iatt iatt = {0,}; + int32_t op_errno = 0; + char *uuid_str = NULL; + uuid_t node_uuid = {0,}; + struct timeval dir_start = {0,}; + struct timeval end = {0,}; + double elapsed = {0,}; + struct timeval start = {0,}; + int32_t err = 0; + int loglevel = GF_LOG_TRACE; + + gf_log (this->name, GF_LOG_INFO, "migrate data called on %s", + loc->path); + gettimeofday (&dir_start, NULL); + + fd = fd_create (loc->inode, defrag->pid); + if (!fd) { + gf_log (this->name, GF_LOG_ERROR, "Failed to create fd"); + goto out; + } + + ret = syncop_opendir (this, loc, fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to open dir %s", + loc->path); + ret = -1; + goto out; + } + + INIT_LIST_HEAD (&entries.list); + + while ((ret = syncop_readdirp (this, fd, 131072, offset, NULL, + &entries)) != 0) { + + if (ret < 0) { + + gf_log (this->name, GF_LOG_ERROR, "Readdir returned %s." + " Aborting migrate-data", + strerror(-ret)); + ret = -1; + goto out; + } + + if (list_empty (&entries.list)) + break; + + free_entries = _gf_true; + + list_for_each_entry_safe (entry, tmp, &entries.list, list) { + if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { + ret = 1; + goto out; + } + + offset = entry->d_off; + + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; + + if (IA_ISDIR (entry->d_stat.ia_type)) + continue; + + defrag->num_files_lookedup++; + if (defrag->stats == _gf_true) { + gettimeofday (&start, NULL); + } + if (defrag->defrag_pattern && + (gf_defrag_pattern_match (defrag, entry->d_name, + entry->d_stat.ia_size) + == _gf_false)) { + continue; + } + loc_wipe (&entry_loc); + ret =dht_build_child_loc (this, &entry_loc, loc, + entry->d_name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Child loc" + " build failed"); + goto out; + } + + if (uuid_is_null (entry->d_stat.ia_gfid)) { + gf_log (this->name, GF_LOG_ERROR, "%s/%s" + " gfid not present", loc->path, + entry->d_name); + continue; + } + + uuid_copy (entry_loc.gfid, entry->d_stat.ia_gfid); + + if (uuid_is_null (loc->gfid)) { + gf_log (this->name, GF_LOG_ERROR, "%s/%s" + " gfid not present", loc->path, + entry->d_name); + continue; + } + + uuid_copy (entry_loc.pargfid, loc->gfid); + + entry_loc.inode->ia_type = entry->d_stat.ia_type; + + ret = syncop_lookup (this, &entry_loc, NULL, &iatt, + NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s" + " lookup failed", entry_loc.path); + ret = -1; + continue; + } + + ret = syncop_getxattr (this, &entry_loc, &dict, + GF_XATTR_NODE_UUID_KEY); + if(ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Failed to " + "get node-uuid for %s", entry_loc.path); + ret = -1; + continue; + } + + ret = dict_get_str (dict, GF_XATTR_NODE_UUID_KEY, + &uuid_str); + if(ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Failed to " + "get node-uuid from dict for %s", + entry_loc.path); + ret = -1; + continue; + } + + if (uuid_parse (uuid_str, node_uuid)) { + gf_log (this->name, GF_LOG_ERROR, "uuid_parse " + "failed for %s", entry_loc.path); + continue; + } + + /* if file belongs to different node, skip migration + * the other node will take responsibility of migration + */ + if (uuid_compare (node_uuid, defrag->node_uuid)) { + gf_log (this->name, GF_LOG_TRACE, "%s does not" + "belong to this node", entry_loc.path); + continue; + } + + uuid_str = NULL; + + dict_del (dict, GF_XATTR_NODE_UUID_KEY); + + + /* if distribute is present, it will honor this key. + * -1, ENODATA is returned if distribute is not present + * or file doesn't have a link-file. If file has + * link-file, the path of link-file will be the value, + * and also that guarantees that file has to be mostly + * migrated */ + + ret = syncop_getxattr (this, &entry_loc, &dict, + GF_XATTR_LINKINFO_KEY); + if (ret < 0) { + if (-ret != ENODATA) { + loglevel = GF_LOG_ERROR; + defrag->total_failures += 1; + } else { + loglevel = GF_LOG_TRACE; + } + gf_log (this->name, loglevel, "%s: failed to " + "get "GF_XATTR_LINKINFO_KEY" key - %s", + entry_loc.path, strerror (-ret)); + ret = -1; + continue; + } + + ret = syncop_setxattr (this, &entry_loc, migrate_data, + 0); + if (ret) { + err = op_errno; + /* errno is overloaded. See + * rebalance_task_completion () */ + if (err != ENOSPC) { + gf_log (this->name, GF_LOG_DEBUG, + "migrate-data skipped for %s" + " due to space constraints", + entry_loc.path); + defrag->skipped +=1; + } else{ + gf_log (this->name, GF_LOG_ERROR, + "migrate-data failed for %s", + entry_loc.path); + defrag->total_failures +=1; + } + } + + if (ret < 0) { + op_errno = -ret; + ret = gf_defrag_handle_migrate_error (op_errno, + defrag); + + if (!ret) + gf_log (this->name, GF_LOG_DEBUG, + "migrate-data on %s failed: %s", + entry_loc.path, + strerror (op_errno)); + else if (ret == 1) + continue; + else if (ret == -1) + goto out; + } + + LOCK (&defrag->lock); + { + defrag->total_files += 1; + defrag->total_data += iatt.ia_size; + } + UNLOCK (&defrag->lock); + if (defrag->stats == _gf_true) { + gettimeofday (&end, NULL); + elapsed = (end.tv_sec - start.tv_sec) * 1e6 + + (end.tv_usec - start.tv_usec); + gf_log (this->name, GF_LOG_INFO, "Migration of " + "file:%s size:%"PRIu64" bytes took %.2f" + "secs", entry_loc.path, iatt.ia_size, + elapsed/1e6); + } + } + + gf_dirent_free (&entries); + free_entries = _gf_false; + INIT_LIST_HEAD (&entries.list); + } + + gettimeofday (&end, NULL); + elapsed = (end.tv_sec - dir_start.tv_sec) * 1e6 + + (end.tv_usec - dir_start.tv_usec); + gf_log (this->name, GF_LOG_INFO, "Migration operation on dir %s took " + "%.2f secs", loc->path, elapsed/1e6); + ret = 0; +out: + if (free_entries) + gf_dirent_free (&entries); + + loc_wipe (&entry_loc); + + if (dict) + dict_unref(dict); + + if (fd) + fd_unref (fd); + return ret; + +} + +int +gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, + dict_t *fix_layout, dict_t *migrate_data) +{ + int ret = -1; + loc_t entry_loc = {0,}; + fd_t *fd = NULL; + gf_dirent_t entries; + gf_dirent_t *tmp = NULL; + gf_dirent_t *entry = NULL; + gf_boolean_t free_entries = _gf_false; + dict_t *dict = NULL; + off_t offset = 0; + struct iatt iatt = {0,}; + + ret = syncop_lookup (this, loc, NULL, &iatt, NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Lookup failed on %s", + loc->path); + ret = -1; + goto out; + } + + if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) { + ret = gf_defrag_migrate_data (this, defrag, loc, migrate_data); + if (ret) + goto out; + } + + gf_log (this->name, GF_LOG_TRACE, "fix layout called on %s", loc->path); + + fd = fd_create (loc->inode, defrag->pid); + if (!fd) { + gf_log (this->name, GF_LOG_ERROR, "Failed to create fd"); + ret = -1; + goto out; + } + + ret = syncop_opendir (this, loc, fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to open dir %s", + loc->path); + ret = -1; + goto out; + } + + INIT_LIST_HEAD (&entries.list); + while ((ret = syncop_readdirp (this, fd, 131072, offset, NULL, + &entries)) != 0) + { + + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Readdir returned %s" + ". Aborting fix-layout",strerror(-ret)); + ret = -1; + goto out; + } + + if (list_empty (&entries.list)) + break; + + free_entries = _gf_true; + + list_for_each_entry_safe (entry, tmp, &entries.list, list) { + if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { + ret = 1; + goto out; + } + + offset = entry->d_off; + + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; + + if (!IA_ISDIR (entry->d_stat.ia_type)) + continue; + + loc_wipe (&entry_loc); + ret =dht_build_child_loc (this, &entry_loc, loc, + entry->d_name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Child loc" + " build failed"); + goto out; + } + + if (uuid_is_null (entry->d_stat.ia_gfid)) { + gf_log (this->name, GF_LOG_ERROR, "%s/%s" + " gfid not present", loc->path, + entry->d_name); + continue; + } + + entry_loc.inode->ia_type = entry->d_stat.ia_type; + + uuid_copy (entry_loc.gfid, entry->d_stat.ia_gfid); + if (uuid_is_null (loc->gfid)) { + gf_log (this->name, GF_LOG_ERROR, "%s/%s" + " gfid not present", loc->path, + entry->d_name); + continue; + } + + uuid_copy (entry_loc.pargfid, loc->gfid); + + ret = syncop_lookup (this, &entry_loc, NULL, &iatt, + NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s" + " lookup failed", entry_loc.path); + ret = -1; + continue; + } + + ret = syncop_setxattr (this, &entry_loc, fix_layout, + 0); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Setxattr " + "failed for %s", entry_loc.path); + defrag->defrag_status = + GF_DEFRAG_STATUS_FAILED; + defrag->total_failures ++; + ret = -1; + goto out; + } + ret = gf_defrag_fix_layout (this, defrag, &entry_loc, + fix_layout, migrate_data); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Fix layout " + "failed for %s", entry_loc.path); + defrag->total_failures++; + goto out; + } + + } + gf_dirent_free (&entries); + free_entries = _gf_false; + INIT_LIST_HEAD (&entries.list); + } + + ret = 0; +out: + if (free_entries) + gf_dirent_free (&entries); + + loc_wipe (&entry_loc); + + if (dict) + dict_unref(dict); + + if (fd) + fd_unref (fd); + + return ret; + +} + + +int +gf_defrag_start_crawl (void *data) +{ + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + gf_defrag_info_t *defrag = NULL; + int ret = -1; + loc_t loc = {0,}; + struct iatt iatt = {0,}; + struct iatt parent = {0,}; + dict_t *fix_layout = NULL; + dict_t *migrate_data = NULL; + dict_t *status = NULL; + glusterfs_ctx_t *ctx = NULL; + + this = data; + if (!this) + goto out; + + ctx = this->ctx; + if (!ctx) + goto out; + + conf = this->private; + if (!conf) + goto out; + + defrag = conf->defrag; + if (!defrag) + goto out; + + gettimeofday (&defrag->start_time, NULL); + dht_build_root_inode (this, &defrag->root_inode); + if (!defrag->root_inode) + goto out; + + dht_build_root_loc (defrag->root_inode, &loc); + + /* fix-layout on '/' first */ + + ret = syncop_lookup (this, &loc, NULL, &iatt, NULL, &parent); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "look up on / failed"); + ret = -1; + goto out; + } + + fix_layout = dict_new (); + if (!fix_layout) { + ret = -1; + goto out; + } + + ret = dict_set_str (fix_layout, GF_XATTR_FIX_LAYOUT_KEY, "yes"); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set dict str"); + goto out; + } + + ret = syncop_setxattr (this, &loc, fix_layout, 0); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "fix layout on %s failed", + loc.path); + defrag->total_failures++; + ret = -1; + goto out; + } + + if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) { + migrate_data = dict_new (); + if (!migrate_data) { + ret = -1; + goto out; + } + if (defrag->cmd == GF_DEFRAG_CMD_START_FORCE) + ret = dict_set_str (migrate_data, + "distribute.migrate-data", "force"); + else + ret = dict_set_str (migrate_data, + "distribute.migrate-data", + "non-force"); + if (ret) + goto out; + } + ret = gf_defrag_fix_layout (this, defrag, &loc, fix_layout, + migrate_data); + if ((defrag->defrag_status != GF_DEFRAG_STATUS_STOPPED) && + (defrag->defrag_status != GF_DEFRAG_STATUS_FAILED)) { + defrag->defrag_status = GF_DEFRAG_STATUS_COMPLETE; + } + + + +out: + LOCK (&defrag->lock); + { + status = dict_new (); + gf_defrag_status_get (defrag, status); + if (ctx->notify) + ctx->notify (GF_EN_DEFRAG_STATUS, status); + if (status) + dict_unref (status); + defrag->is_exiting = 1; + } + UNLOCK (&defrag->lock); + + if (defrag) { + GF_FREE (defrag); + conf->defrag = NULL; + } + + return ret; +} + + +static int +gf_defrag_done (int ret, call_frame_t *sync_frame, void *data) +{ + gf_listener_stop (sync_frame->this); + + STACK_DESTROY (sync_frame->root); + kill (getpid(), SIGTERM); + return 0; +} + +void * +gf_defrag_start (void *data) +{ + int ret = -1; + call_frame_t *frame = NULL; + dht_conf_t *conf = NULL; + gf_defrag_info_t *defrag = NULL; + xlator_t *this = NULL; + + this = data; + conf = this->private; + if (!conf) + goto out; + + defrag = conf->defrag; + if (!defrag) + goto out; + + frame = create_frame (this, this->ctx->pool); + if (!frame) + goto out; + + frame->root->pid = GF_CLIENT_PID_DEFRAG; + + defrag->pid = frame->root->pid; + + defrag->defrag_status = GF_DEFRAG_STATUS_STARTED; + + ret = synctask_new (this->ctx->env, gf_defrag_start_crawl, + gf_defrag_done, frame, this); + + if (ret) + gf_log (this->name, GF_LOG_ERROR, "Could not create" + " task for rebalance"); +out: + return NULL; +} + +int +gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict) +{ + int ret = 0; + uint64_t files = 0; + uint64_t size = 0; + uint64_t lookup = 0; + uint64_t failures = 0; + uint64_t skipped = 0; + char *status = ""; + double elapsed = 0; + struct timeval end = {0,}; + + + if (!defrag) + goto out; + + ret = 0; + if (defrag->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED) + goto out; + + files = defrag->total_files; + size = defrag->total_data; + lookup = defrag->num_files_lookedup; + failures = defrag->total_failures; + skipped = defrag->skipped; + + gettimeofday (&end, NULL); + + elapsed = end.tv_sec - defrag->start_time.tv_sec; + + if (!dict) + goto log; + + ret = dict_set_uint64 (dict, "files", files); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set file count"); + + ret = dict_set_uint64 (dict, "size", size); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set size of xfer"); + + ret = dict_set_uint64 (dict, "lookups", lookup); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set lookedup file count"); + + + ret = dict_set_int32 (dict, "status", defrag->defrag_status); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set status"); + if (elapsed) { + ret = dict_set_double (dict, "run-time", elapsed); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set run-time"); + } + + ret = dict_set_uint64 (dict, "failures", failures); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set failure count"); + + ret = dict_set_uint64 (dict, "skipped", skipped); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set skipped file count"); +log: + switch (defrag->defrag_status) { + case GF_DEFRAG_STATUS_NOT_STARTED: + status = "not started"; + break; + case GF_DEFRAG_STATUS_STARTED: + status = "in progress"; + break; + case GF_DEFRAG_STATUS_STOPPED: + status = "stopped"; + break; + case GF_DEFRAG_STATUS_COMPLETE: + status = "completed"; + break; + case GF_DEFRAG_STATUS_FAILED: + status = "failed"; + break; + default: + break; + } + + gf_log (THIS->name, GF_LOG_INFO, "Rebalance is %s. Time taken is %.2f " + "secs", status, elapsed); + gf_log (THIS->name, GF_LOG_INFO, "Files migrated: %"PRIu64", size: %" + PRIu64", lookups: %"PRIu64", failures: %"PRIu64", skipped: " + "%"PRIu64, files, size, lookup, failures, skipped); + + +out: + return 0; +} + +int +gf_defrag_stop (gf_defrag_info_t *defrag, gf_defrag_status_t status, + dict_t *output) +{ + /* TODO: set a variable 'stop_defrag' here, it should be checked + in defrag loop */ + int ret = -1; + GF_ASSERT (defrag); + + if (defrag->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED) { + goto out; + } + + gf_log ("", GF_LOG_INFO, "Received stop command on rebalance"); + defrag->defrag_status = status; + + if (output) + gf_defrag_status_get (defrag, output); + ret = 0; +out: + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c index 6c34dabad..925538cc8 100644 --- a/xlators/cluster/dht/src/dht-rename.c +++ b/xlators/cluster/dht/src/dht-rename.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ /* TODO: link(oldpath, newpath) fails if newpath already exists. DHT should @@ -33,373 +24,801 @@ int dht_rename_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *stbuf) + int32_t op_ret, int32_t op_errno, struct iatt *stbuf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - - - local = frame->local; - prev = cookie; + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + + local = frame->local; + prev = cookie; + + if (op_ret == -1) { + /* TODO: undo the damage */ + + gf_log (this->name, GF_LOG_INFO, + "rename %s -> %s on %s failed (%s)", + local->loc.path, local->loc2.path, + prev->this->name, strerror (op_errno)); + + local->op_ret = op_ret; + local->op_errno = op_errno; + goto unwind; + } + /* TODO: construct proper stbuf for dir */ + /* + * FIXME: is this the correct way to build stbuf and + * parent bufs? + */ + dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); + dht_iatt_merge (this, &local->preoldparent, preoldparent, + prev->this); + dht_iatt_merge (this, &local->postoldparent, postoldparent, + prev->this); + dht_iatt_merge (this, &local->preparent, prenewparent, + prev->this); + dht_iatt_merge (this, &local->postparent, postnewparent, + prev->this); - if (op_ret == -1) { - /* TODO: undo the damage */ - gf_log (this->name, GF_LOG_DEBUG, - "rename %s -> %s on %s failed (%s)", - local->loc.path, local->loc2.path, - prev->this->name, strerror (op_errno)); - - local->op_ret = op_ret; - local->op_errno = op_errno; - } else { - /* TODO: construct proper stbuf for dir */ - local->stbuf = *stbuf; - } +unwind: + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + WIPE (&local->preoldparent); + WIPE (&local->postoldparent); + WIPE (&local->preparent); + WIPE (&local->postparent); + + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); + DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, + &local->stbuf, &local->preoldparent, + &local->postoldparent, + &local->preparent, &local->postparent, xdata); + } + + return 0; +} - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->stbuf); - } - return 0; +int +dht_rename_hashed_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *stbuf, + struct iatt *preoldparent, + struct iatt *postoldparent, + struct iatt *prenewparent, + struct iatt *postnewparent, dict_t *xdata) +{ + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int call_cnt = 0; + call_frame_t *prev = NULL; + int i = 0; + + conf = this->private; + local = frame->local; + prev = cookie; + + if (op_ret == -1) { + /* TODO: undo the damage */ + + gf_log (this->name, GF_LOG_INFO, + "rename %s -> %s on %s failed (%s)", + local->loc.path, local->loc2.path, + prev->this->name, strerror (op_errno)); + + local->op_ret = op_ret; + local->op_errno = op_errno; + goto unwind; + } + /* TODO: construct proper stbuf for dir */ + /* + * FIXME: is this the correct way to build stbuf and + * parent bufs? + */ + dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); + dht_iatt_merge (this, &local->preoldparent, preoldparent, + prev->this); + dht_iatt_merge (this, &local->postoldparent, postoldparent, + prev->this); + dht_iatt_merge (this, &local->preparent, prenewparent, + prev->this); + dht_iatt_merge (this, &local->postparent, postnewparent, + prev->this); + + call_cnt = local->call_cnt = conf->subvolume_cnt - 1; + + if (!local->call_cnt) + goto unwind; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == local->dst_hashed) + continue; + STACK_WIND (frame, dht_rename_dir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->rename, + &local->loc, &local->loc2, NULL); + if (!--call_cnt) + break; + } + + + return 0; +unwind: + WIPE (&local->preoldparent); + WIPE (&local->postoldparent); + WIPE (&local->preparent); + WIPE (&local->postparent); + + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); + DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, + &local->stbuf, &local->preoldparent, + &local->postoldparent, + &local->preparent, &local->postparent, NULL); + + return 0; } - int dht_rename_dir_do (call_frame_t *frame, xlator_t *this) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int i = 0; + dht_local_t *local = NULL; - conf = this->private; - local = frame->local; + local = frame->local; - if (local->op_ret == -1) - goto err; + if (local->op_ret == -1) + goto err; - local->call_cnt = conf->subvolume_cnt; - local->op_ret = 0; + local->op_ret = 0; - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (frame, dht_rename_dir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->rename, - &local->loc, &local->loc2); - } - - return 0; + STACK_WIND (frame, dht_rename_hashed_dir_cbk, + local->dst_hashed, + local->dst_hashed->fops->rename, + &local->loc, &local->loc2, NULL); + return 0; err: - DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno); - return 0; + DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, NULL, NULL, + NULL, NULL, NULL, NULL); + return 0; } int dht_rename_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *entries) + int op_ret, int op_errno, gf_dirent_t *entries, + dict_t *xdata) { - dht_local_t *local = NULL; - int this_call_cnt = -1; - call_frame_t *prev = NULL; + dht_local_t *local = NULL; + int this_call_cnt = -1; + call_frame_t *prev = NULL; - local = frame->local; - prev = cookie; + local = frame->local; + prev = cookie; - if (op_ret > 2) { - gf_log (this->name, GF_LOG_TRACE, - "readdir on %s for %s returned %d entries", - prev->this->name, local->loc.path, op_ret); - local->op_ret = -1; - local->op_errno = ENOTEMPTY; - } + if (op_ret > 2) { + gf_log (this->name, GF_LOG_TRACE, + "readdir on %s for %s returned %d entries", + prev->this->name, local->loc.path, op_ret); + local->op_ret = -1; + local->op_errno = ENOTEMPTY; + } - this_call_cnt = dht_frame_return (frame); + this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - dht_rename_dir_do (frame, this); - } + if (is_last_call (this_call_cnt)) { + dht_rename_dir_do (frame, this); + } - return 0; + return 0; } int dht_rename_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd) + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) { - dht_local_t *local = NULL; - int this_call_cnt = -1; - call_frame_t *prev = NULL; + dht_local_t *local = NULL; + int this_call_cnt = -1; + call_frame_t *prev = NULL; - local = frame->local; - prev = cookie; + local = frame->local; + prev = cookie; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "opendir on %s for %s failed (%s)", - prev->this->name, local->loc.path, - strerror (op_errno)); - goto err; - } + if (op_ret == -1) { + gf_log (this->name, GF_LOG_INFO, + "opendir on %s for %s failed (%s)", + prev->this->name, local->loc.path, + strerror (op_errno)); + goto err; + } - STACK_WIND (frame, dht_rename_readdir_cbk, - prev->this, prev->this->fops->readdir, - local->fd, 4096, 0); + STACK_WIND (frame, dht_rename_readdir_cbk, + prev->this, prev->this->fops->readdir, + local->fd, 4096, 0, NULL); - return 0; + return 0; err: - this_call_cnt = dht_frame_return (frame); + this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - dht_rename_dir_do (frame, this); - } + if (is_last_call (this_call_cnt)) { + dht_rename_dir_do (frame, this); + } - return 0; + return 0; } int dht_rename_dir (call_frame_t *frame, xlator_t *this) { - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - int i = 0; - int op_errno = -1; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int i = 0; + int op_errno = -1; - conf = frame->this->private; - local = frame->local; + conf = frame->this->private; + local = frame->local; - local->call_cnt = conf->subvolume_cnt; + local->call_cnt = conf->subvolume_cnt; - local->fd = fd_create (local->loc.inode, frame->root->pid); - if (!local->fd) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - op_errno = ENOMEM; - goto err; - } + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!conf->subvolume_status[i]) { + gf_log (this->name, GF_LOG_INFO, + "one of the subvolumes down (%s)", + conf->subvolumes[i]->name); + op_errno = ENOTCONN; + goto err; + } + } - local->op_ret = 0; + local->fd = fd_create (local->loc.inode, frame->root->pid); + if (!local->fd) { + op_errno = ENOMEM; + goto err; + } - if (!local->dst_cached) { - dht_rename_dir_do (frame, this); - return 0; - } + local->op_ret = 0; - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (frame, dht_rename_opendir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->opendir, - &local->loc2, local->fd); - } + if (!local->dst_cached) { + dht_rename_dir_do (frame, this); + return 0; + } - return 0; + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_rename_opendir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->opendir, + &local->loc2, local->fd, NULL); + } + + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL); - return 0; + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); + return 0; } - +#define DHT_MARK_FOP_INTERNAL(xattr) do { \ + int tmp = -1; \ + if (!xattr) { \ + xattr = dict_new (); \ + if (!xattr) \ + break; \ + } \ + tmp = dict_set_str (xattr, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); \ + if (tmp) { \ + gf_log (this->name, GF_LOG_ERROR, "Failed to set" \ + " internal dict key for %s", local->loc.path); \ + } \ + }while (0) + +#define DHT_MARKER_DONT_ACCOUNT(xattr) do { \ + int tmp = -1; \ + if (!xattr) { \ + xattr = dict_new (); \ + if (!xattr) \ + break; \ + } \ + tmp = dict_set_str (xattr, GLUSTERFS_MARKER_DONT_ACCOUNT_KEY, \ + "yes"); \ + if (tmp) { \ + gf_log (this->name, GF_LOG_ERROR, "Failed to set" \ + " marker dont account key for %s", local->loc.path); \ + } \ + }while (0) int -dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +dht_rename_done (call_frame_t *frame, xlator_t *this) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - int this_call_cnt = 0; + dht_local_t *local = NULL; - local = frame->local; - prev = cookie; + local = frame->local; - this_call_cnt = dht_frame_return (frame); - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "unlink on %s failed (%s)", - prev->this->name, strerror (op_errno)); - } + if (local->linked == _gf_true) { + local->linked = _gf_false; + dht_linkfile_attr_heal (frame, this); + } + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); + DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, + &local->stbuf, &local->preoldparent, + &local->postoldparent, &local->preparent, + &local->postparent, NULL); - if (is_last_call (this_call_cnt)) - DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->stbuf); + return 0; +} - return 0; +int +dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int this_call_cnt = 0; + + local = frame->local; + prev = cookie; + + if (!local) { + gf_log (this->name, GF_LOG_ERROR, + "!local, should not happen"); + goto out; + } + + this_call_cnt = dht_frame_return (frame); + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s: unlink on %s failed (%s)", + local->loc.path, prev->this->name, strerror (op_errno)); + } + + WIPE (&local->preoldparent); + WIPE (&local->postoldparent); + WIPE (&local->preparent); + WIPE (&local->postparent); + + if (is_last_call (this_call_cnt)) { + dht_rename_done (frame, this); + } + +out: + return 0; } int -dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *stbuf) +dht_rename_cleanup (call_frame_t *frame) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - xlator_t *src_hashed = NULL; - xlator_t *src_cached = NULL; - xlator_t *dst_hashed = NULL; - xlator_t *dst_cached = NULL; - xlator_t *rename_subvol = NULL; + dht_local_t *local = NULL; + xlator_t *this = NULL; + xlator_t *src_hashed = NULL; + xlator_t *src_cached = NULL; + xlator_t *dst_hashed = NULL; + xlator_t *dst_cached = NULL; + int call_cnt = 0; + dict_t *xattr = NULL; - local = frame->local; - prev = cookie; + local = frame->local; + this = frame->this; - src_hashed = local->src_hashed; - src_cached = local->src_cached; - dst_hashed = local->dst_hashed; - dst_cached = local->dst_cached; + src_hashed = local->src_hashed; + src_cached = local->src_cached; + dst_hashed = local->dst_hashed; + dst_cached = local->dst_cached; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "rename on %s failed (%s)", prev->this->name, - strerror (op_errno)); - local->op_ret = op_ret; - local->op_errno = op_errno; - goto unwind; - } - - /* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk - * is called. since rename has already happened on rename_subvol, - * unlink should not be sent for oldpath (either linkfile or cached-file) - * on rename_subvol. */ - if (src_cached == dst_cached) - rename_subvol = src_cached; - else - rename_subvol = dst_hashed; + if (src_cached == dst_cached) + goto nolinks; - /* TODO: delete files in background */ + if (dst_hashed != src_hashed && dst_hashed != src_cached) + call_cnt++; - if (src_cached != dst_hashed && src_cached != dst_cached) - local->call_cnt++; + if (src_cached != dst_hashed) + call_cnt++; - if (src_hashed != rename_subvol && src_hashed != src_cached) - local->call_cnt++; + local->call_cnt = call_cnt; - if (dst_cached && dst_cached != dst_hashed && dst_cached != src_cached) - local->call_cnt++; + if (!call_cnt) + goto nolinks; - if (local->call_cnt == 0) - goto unwind; + DHT_MARK_FOP_INTERNAL (xattr); - if (src_cached != dst_hashed && src_cached != dst_cached) { - gf_log (this->name, GF_LOG_TRACE, - "deleting old src datafile %s @ %s", - local->loc.path, src_cached->name); + if (dst_hashed != src_hashed && dst_hashed != src_cached) { + dict_t *xattr_new = NULL; - STACK_WIND (frame, dht_rename_unlink_cbk, - src_cached, src_cached->fops->unlink, - &local->loc); - } + gf_log (this->name, GF_LOG_TRACE, + "unlinking linkfile %s @ %s => %s", + local->loc.path, dst_hashed->name, src_cached->name); - if (src_hashed != rename_subvol && src_hashed != src_cached) { - gf_log (this->name, GF_LOG_TRACE, - "deleting old src linkfile %s @ %s", - local->loc.path, src_hashed->name); + xattr_new = dict_copy_with_ref (xattr, NULL); - STACK_WIND (frame, dht_rename_unlink_cbk, - src_hashed, src_hashed->fops->unlink, - &local->loc); - } - if (dst_cached - && (dst_cached != dst_hashed) - && (dst_cached != src_cached)) { - gf_log (this->name, GF_LOG_TRACE, - "deleting old dst datafile %s @ %s", - local->loc2.path, dst_cached->name); + DHT_MARKER_DONT_ACCOUNT(xattr_new); - STACK_WIND (frame, dht_rename_unlink_cbk, - dst_cached, dst_cached->fops->unlink, - &local->loc2); - } - return 0; + STACK_WIND (frame, dht_rename_unlink_cbk, + dst_hashed, dst_hashed->fops->unlink, + &local->loc, 0, xattr_new); + + dict_unref (xattr_new); + xattr_new = NULL; + } + + if (src_cached != dst_hashed) { + dict_t *xattr_new = NULL; + + gf_log (this->name, GF_LOG_TRACE, + "unlinking link %s => %s (%s)", local->loc.path, + local->loc2.path, src_cached->name); + + xattr_new = dict_copy_with_ref (xattr, NULL); + + if (uuid_compare (local->loc.pargfid, + local->loc2.pargfid) == 0) { + DHT_MARKER_DONT_ACCOUNT(xattr_new); + } + + STACK_WIND (frame, dht_rename_unlink_cbk, + src_cached, src_cached->fops->unlink, + &local->loc2, 0, xattr_new); + + dict_unref (xattr_new); + xattr_new = NULL; + } + + if (xattr) + dict_unref (xattr); + + return 0; + +nolinks: + WIPE (&local->preoldparent); + WIPE (&local->postoldparent); + WIPE (&local->preparent); + WIPE (&local->postparent); + + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); + DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, + &local->stbuf, &local->preoldparent, + &local->postoldparent, &local->preparent, + &local->postparent, NULL); + + return 0; +} + + +int +dht_rename_links_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + call_frame_t *prev = NULL; + dht_local_t *local = NULL; + + prev = cookie; + local = frame->local; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "link/file %s on %s failed (%s)", + local->loc.path, prev->this->name, strerror (op_errno)); + } + + if (local->linked == _gf_true) { + local->linked = _gf_false; + dht_linkfile_attr_heal (frame, this); + } + DHT_STACK_DESTROY (frame); + + return 0; +} + + +int +dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *stbuf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + xlator_t *src_hashed = NULL; + xlator_t *src_cached = NULL; + xlator_t *dst_hashed = NULL; + xlator_t *dst_cached = NULL; + xlator_t *rename_subvol = NULL; + call_frame_t *link_frame = NULL; + dht_local_t *link_local = NULL; + dict_t *xattr = NULL; + + local = frame->local; + prev = cookie; + + src_hashed = local->src_hashed; + src_cached = local->src_cached; + dst_hashed = local->dst_hashed; + dst_cached = local->dst_cached; + + if (local->linked == _gf_true) + FRAME_SU_UNDO (frame, dht_local_t); + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s: rename on %s failed (%s)", local->loc.path, + prev->this->name, strerror (op_errno)); + local->op_ret = op_ret; + local->op_errno = op_errno; + goto cleanup; + } + + if ((src_cached == dst_cached) && (dst_hashed != dst_cached)) { + link_frame = copy_frame (frame); + if (!link_frame) { + goto err; + } + + /* fop value sent as maxvalue because it is not used + anywhere in this case */ + link_local = dht_local_init (link_frame, &local->loc2, NULL, + GF_FOP_MAXVALUE); + if (!link_local) { + goto err; + } + + if (link_local->loc.inode) + inode_unref (link_local->loc.inode); + link_local->loc.inode = inode_ref (local->loc.inode); + uuid_copy (link_local->gfid, local->loc.inode->gfid); + + dht_linkfile_create (link_frame, dht_rename_links_create_cbk, + this, src_cached, dst_hashed, + &link_local->loc); + } + +err: + /* Merge attrs only from src_cached. In case there of src_cached != + * dst_hashed, this ignores linkfile attrs. */ + if (prev->this == src_cached) { + dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); + dht_iatt_merge (this, &local->preoldparent, preoldparent, + prev->this); + dht_iatt_merge (this, &local->postoldparent, postoldparent, + prev->this); + dht_iatt_merge (this, &local->preparent, prenewparent, + prev->this); + dht_iatt_merge (this, &local->postparent, postnewparent, + prev->this); + } + + + /* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk + * is called. since rename has already happened on rename_subvol, + * unlink should not be sent for oldpath (either linkfile or cached-file) + * on rename_subvol. */ + if (src_cached == dst_cached) + rename_subvol = src_cached; + else + rename_subvol = dst_hashed; + + /* TODO: delete files in background */ + + if (src_cached != dst_hashed && src_cached != dst_cached) + local->call_cnt++; + + if (src_hashed != rename_subvol && src_hashed != src_cached) + local->call_cnt++; + + if (dst_cached && dst_cached != dst_hashed && dst_cached != src_cached) + local->call_cnt++; + + if (local->call_cnt == 0) + goto unwind; + + DHT_MARK_FOP_INTERNAL (xattr); + + if (src_cached != dst_hashed && src_cached != dst_cached) { + dict_t *xattr_new = NULL; + + xattr_new = dict_copy_with_ref (xattr, NULL); + + gf_log (this->name, GF_LOG_TRACE, + "deleting old src datafile %s @ %s", + local->loc.path, src_cached->name); + + if (uuid_compare (local->loc.pargfid, + local->loc2.pargfid) == 0) { + DHT_MARKER_DONT_ACCOUNT(xattr_new); + } + + STACK_WIND (frame, dht_rename_unlink_cbk, + src_cached, src_cached->fops->unlink, + &local->loc, 0, xattr_new); + + dict_unref (xattr_new); + xattr_new = NULL; + } + + if (src_hashed != rename_subvol && src_hashed != src_cached) { + dict_t *xattr_new = NULL; + + xattr_new = dict_copy_with_ref (xattr, NULL); + + gf_log (this->name, GF_LOG_TRACE, + "deleting old src linkfile %s @ %s", + local->loc.path, src_hashed->name); + + DHT_MARKER_DONT_ACCOUNT(xattr_new); + + STACK_WIND (frame, dht_rename_unlink_cbk, + src_hashed, src_hashed->fops->unlink, + &local->loc, 0, xattr_new); + + dict_unref (xattr_new); + xattr_new = NULL; + } + + if (dst_cached + && (dst_cached != dst_hashed) + && (dst_cached != src_cached)) { + gf_log (this->name, GF_LOG_TRACE, + "deleting old dst datafile %s @ %s", + local->loc2.path, dst_cached->name); + + STACK_WIND (frame, dht_rename_unlink_cbk, + dst_cached, dst_cached->fops->unlink, + &local->loc2, 0, xattr); + } + if (xattr) + dict_unref (xattr); + return 0; unwind: - DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->stbuf); + WIPE (&local->preoldparent); + WIPE (&local->postoldparent); + WIPE (&local->preparent); + WIPE (&local->postparent); + if (xattr) + dict_unref (xattr); - return 0; + dht_rename_done (frame, this); + + return 0; + +cleanup: + if (xattr) + dict_unref (xattr); + dht_rename_cleanup (frame); + + return 0; } int dht_do_rename (call_frame_t *frame) { - dht_local_t *local = NULL; - xlator_t *dst_hashed = NULL; - xlator_t *src_cached = NULL; - xlator_t *dst_cached = NULL; - xlator_t *this = NULL; - xlator_t *rename_subvol = NULL; + dht_local_t *local = NULL; + xlator_t *dst_hashed = NULL; + xlator_t *src_cached = NULL; + xlator_t *dst_cached = NULL; + xlator_t *this = NULL; + xlator_t *rename_subvol = NULL; + dict_t *dict = NULL; + + + local = frame->local; + this = frame->this; + + dst_hashed = local->dst_hashed; + dst_cached = local->dst_cached; + src_cached = local->src_cached; + + if (src_cached == dst_cached) + rename_subvol = src_cached; + else + rename_subvol = dst_hashed; + + if ((src_cached != dst_hashed) && (rename_subvol == dst_hashed)) { + DHT_MARKER_DONT_ACCOUNT(dict); + } + + gf_log (this->name, GF_LOG_TRACE, + "renaming %s => %s (%s)", + local->loc.path, local->loc2.path, rename_subvol->name); + + if (local->linked == _gf_true) + FRAME_SU_DO (frame, dht_local_t); + STACK_WIND (frame, dht_rename_cbk, + rename_subvol, rename_subvol->fops->rename, + &local->loc, &local->loc2, dict); + + return 0; +} - local = frame->local; - this = frame->this; +int +dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int this_call_cnt = 0; - dst_hashed = local->dst_hashed; - dst_cached = local->dst_cached; - src_cached = local->src_cached; - if (src_cached == dst_cached) - rename_subvol = src_cached; - else - rename_subvol = dst_hashed; + local = frame->local; + prev = cookie; - gf_log (this->name, GF_LOG_TRACE, - "renaming %s => %s (%s)", - local->loc.path, local->loc2.path, rename_subvol->name); + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "link/file on %s failed (%s)", + prev->this->name, strerror (op_errno)); + local->op_ret = -1; + if (op_errno != ENOENT) + local->op_errno = op_errno; + } else if (local->src_cached == prev->this) { + /* merge of attr returned only from linkfile creation */ + dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); + } - STACK_WIND (frame, dht_rename_cbk, - rename_subvol, rename_subvol->fops->rename, - &local->loc, &local->loc2); + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + if (local->op_ret == -1) + goto cleanup; - return 0; + dht_do_rename (frame); + } + + return 0; + +cleanup: + dht_rename_cleanup (frame); + + return 0; } int -dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct stat *stbuf) +dht_rename_unlink_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; - int this_call_cnt = 0; local = frame->local; prev = cookie; - - if (op_ret == -1) { + + if ((op_ret == -1) && (op_errno != ENOENT)) { gf_log (this->name, GF_LOG_DEBUG, - "link/file on %s failed (%s)", - prev->this->name, strerror (op_errno)); + "unlink of %s on %s failed (%s)", + local->loc2.path, prev->this->name, + strerror (op_errno)); local->op_ret = -1; local->op_errno = op_errno; } - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - if (local->op_ret == -1) - goto unwind; - - dht_do_rename (frame); - } + if (local->op_ret == -1) + goto cleanup; + + dht_do_rename (frame); return 0; -unwind: - DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->stbuf); +cleanup: + dht_rename_cleanup (frame); return 0; } @@ -408,155 +827,183 @@ unwind: int dht_rename_create_links (call_frame_t *frame) { - dht_local_t *local = NULL; - xlator_t *this = NULL; - xlator_t *src_hashed = NULL; - xlator_t *src_cached = NULL; - xlator_t *dst_hashed = NULL; - xlator_t *dst_cached = NULL; - int call_cnt = 0; + dht_local_t *local = NULL; + xlator_t *this = NULL; + xlator_t *src_hashed = NULL; + xlator_t *src_cached = NULL; + xlator_t *dst_hashed = NULL; + xlator_t *dst_cached = NULL; + int call_cnt = 0; + dict_t *xattr = NULL; - local = frame->local; - this = frame->this; + local = frame->local; + this = frame->this; - src_hashed = local->src_hashed; - src_cached = local->src_cached; - dst_hashed = local->dst_hashed; - dst_cached = local->dst_cached; + src_hashed = local->src_hashed; + src_cached = local->src_cached; + dst_hashed = local->dst_hashed; + dst_cached = local->dst_cached; - if (src_cached == dst_cached) - goto nolinks; + DHT_MARK_FOP_INTERNAL (xattr); - if (dst_hashed != src_hashed && dst_hashed != src_cached) - call_cnt++; + if (src_cached == dst_cached) { + dict_t *xattr_new = NULL; - if (src_cached != dst_hashed) - call_cnt++; + if (dst_hashed == dst_cached) + goto nolinks; - local->call_cnt = call_cnt; + xattr_new = dict_copy_with_ref (xattr, NULL); - if (dst_hashed != src_hashed && dst_hashed != src_cached) { gf_log (this->name, GF_LOG_TRACE, - "linkfile %s @ %s => %s", - local->loc.path, dst_hashed->name, src_cached->name); - dht_linkfile_create (frame, dht_rename_links_cbk, + "unlinking dst linkfile %s @ %s", + local->loc2.path, dst_hashed->name); + + DHT_MARKER_DONT_ACCOUNT(xattr_new); + + STACK_WIND (frame, dht_rename_unlink_links_cbk, + dst_hashed, dst_hashed->fops->unlink, + &local->loc2, 0, xattr_new); + + dict_unref (xattr_new); + return 0; + } + + if (dst_hashed != src_hashed && dst_hashed != src_cached) + call_cnt++; + + if (src_cached != dst_hashed) + call_cnt++; + + local->call_cnt = call_cnt; + + if (dst_hashed != src_hashed && dst_hashed != src_cached) { + gf_log (this->name, GF_LOG_TRACE, + "linkfile %s @ %s => %s", + local->loc.path, dst_hashed->name, src_cached->name); + memcpy (local->gfid, local->loc.inode->gfid, 16); + dht_linkfile_create (frame, dht_rename_links_cbk, this, src_cached, dst_hashed, &local->loc); } if (src_cached != dst_hashed) { + dict_t *xattr_new = NULL; + + xattr_new = dict_copy_with_ref (xattr, NULL); + gf_log (this->name, GF_LOG_TRACE, "link %s => %s (%s)", local->loc.path, local->loc2.path, src_cached->name); + if (uuid_compare (local->loc.pargfid, + local->loc2.pargfid) == 0) { + DHT_MARKER_DONT_ACCOUNT(xattr_new); + } + STACK_WIND (frame, dht_rename_links_cbk, src_cached, src_cached->fops->link, - &local->loc, &local->loc2); - } + &local->loc, &local->loc2, xattr_new); -nolinks: - if (!call_cnt) { - /* skip to next step */ - dht_do_rename (frame); + dict_unref (xattr_new); } - return 0; +nolinks: + if (!call_cnt) { + /* skip to next step */ + dht_do_rename (frame); + } + if (xattr) + dict_unref (xattr); + + return 0; } int dht_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc) + loc_t *oldloc, loc_t *newloc, dict_t *xdata) { - xlator_t *src_cached = NULL; - xlator_t *src_hashed = NULL; - xlator_t *dst_cached = NULL; - xlator_t *dst_hashed = NULL; - int op_errno = -1; - int ret = -1; - dht_local_t *local = NULL; - - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (oldloc, err); - VALIDATE_OR_GOTO (newloc, err); - - src_hashed = dht_subvol_get_hashed (this, oldloc); - if (!src_hashed) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - oldloc->path); - op_errno = EINVAL; - goto err; - } - - src_cached = dht_subvol_get_cached (this, oldloc->inode); - if (!src_cached) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", oldloc->path); - op_errno = EINVAL; - goto err; - } - - dst_hashed = dht_subvol_get_hashed (this, newloc); - if (!dst_hashed) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - newloc->path); - op_errno = EINVAL; - goto err; - } - - if (newloc->inode) - dst_cached = dht_subvol_get_cached (this, newloc->inode); - - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } - - ret = loc_copy (&local->loc, oldloc); - if (ret == -1) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } - - ret = loc_copy (&local->loc2, newloc); - if (ret == -1) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } - - local->src_hashed = src_hashed; - local->src_cached = src_cached; - local->dst_hashed = dst_hashed; - local->dst_cached = dst_cached; - - gf_log (this->name, GF_LOG_TRACE, - "renaming %s (hash=%s/cache=%s) => %s (hash=%s/cache=%s)", - oldloc->path, src_hashed->name, src_cached->name, - newloc->path, dst_hashed->name, - dst_cached ? dst_cached->name : "<nul>"); - - if (S_ISDIR (oldloc->inode->st_mode)) { - dht_rename_dir (frame, this); - } else { - local->op_ret = 0; - dht_rename_create_links (frame); - } - - return 0; + xlator_t *src_cached = NULL; + xlator_t *src_hashed = NULL; + xlator_t *dst_cached = NULL; + xlator_t *dst_hashed = NULL; + int op_errno = -1; + int ret = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (oldloc, err); + VALIDATE_OR_GOTO (newloc, err); + + src_hashed = dht_subvol_get_hashed (this, oldloc); + if (!src_hashed) { + gf_log (this->name, GF_LOG_INFO, + "no subvolume in layout for path=%s", + oldloc->path); + op_errno = EINVAL; + goto err; + } + + src_cached = dht_subvol_get_cached (this, oldloc->inode); + if (!src_cached) { + gf_log (this->name, GF_LOG_INFO, + "no cached subvolume for path=%s", oldloc->path); + op_errno = EINVAL; + goto err; + } + + dst_hashed = dht_subvol_get_hashed (this, newloc); + if (!dst_hashed) { + gf_log (this->name, GF_LOG_INFO, + "no subvolume in layout for path=%s", + newloc->path); + op_errno = EINVAL; + goto err; + } + + if (newloc->inode) + dst_cached = dht_subvol_get_cached (this, newloc->inode); + + local = dht_local_init (frame, oldloc, NULL, GF_FOP_RENAME); + if (!local) { + op_errno = ENOMEM; + goto err; + } + /* cached_subvol will be set from dht_local_init, reset it to NULL, + as the logic of handling rename is different */ + local->cached_subvol = NULL; + + ret = loc_copy (&local->loc2, newloc); + if (ret == -1) { + op_errno = ENOMEM; + goto err; + } + + local->src_hashed = src_hashed; + local->src_cached = src_cached; + local->dst_hashed = dst_hashed; + local->dst_cached = dst_cached; + + gf_log (this->name, GF_LOG_TRACE, + "renaming %s (hash=%s/cache=%s) => %s (hash=%s/cache=%s)", + oldloc->path, src_hashed->name, src_cached->name, + newloc->path, dst_hashed->name, + dst_cached ? dst_cached->name : "<nul>"); + + if (IA_ISDIR (oldloc->inode->ia_type)) { + dht_rename_dir (frame, this); + } else { + local->op_ret = 0; + dht_rename_create_links (frame); + } + + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); - return 0; + return 0; } diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c index df8b2047b..0e6527544 100644 --- a/xlators/cluster/dht/src/dht-selfheal.c +++ b/xlators/cluster/dht/src/dht-selfheal.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -26,459 +17,1032 @@ #include "glusterfs.h" #include "xlator.h" #include "dht-common.h" +#include "glusterfs-acl.h" + +#define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,cnt,path) do { \ + layout->list[i].start = srt; \ + layout->list[i].stop = srt + chunk - 1; \ + \ + gf_log (this->name, GF_LOG_TRACE, \ + "gave fix: %u - %u on %s for %s", \ + layout->list[i].start, layout->list[i].stop, \ + layout->list[i].xlator->name, path); \ + } while (0) + +#define DHT_RESET_LAYOUT_RANGE(layout) do { \ + int cnt = 0; \ + for (cnt = 0; cnt < layout->cnt; cnt++ ) { \ + layout->list[cnt].start = 0; \ + layout->list[cnt].stop = 0; \ + } \ + } while (0) + +static uint32_t +dht_overlap_calc (dht_layout_t *old, int o, dht_layout_t *new, int n) +{ + if (o >= old->cnt || n >= new->cnt) + return 0; + + if (old->list[o].err > 0 || new->list[n].err > 0) + return 0; + + if (old->list[o].start == old->list[o].stop) { + return 0; + } + + if (new->list[n].start == new->list[n].stop) { + return 0; + } + + if ((old->list[o].start > new->list[n].stop) || + (old->list[o].stop < new->list[n].start)) + return 0; + + return min (old->list[o].stop, new->list[n].stop) - + max (old->list[o].start, new->list[n].start) + 1; +} int dht_selfheal_dir_finish (call_frame_t *frame, xlator_t *this, int ret) { - dht_local_t *local = NULL; - + dht_local_t *local = NULL; - local = frame->local; - local->selfheal.dir_cbk (frame, NULL, frame->this, ret, - local->op_errno); + local = frame->local; + local->selfheal.dir_cbk (frame, NULL, frame->this, ret, + local->op_errno, NULL); - return 0; + return 0; } int dht_selfheal_dir_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) + int op_ret, int op_errno, dict_t *xdata) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - xlator_t *subvol = NULL; - int i = 0; - dht_layout_t *layout = NULL; - int err = 0; - int this_call_cnt = 0; - - local = frame->local; - layout = local->selfheal.layout; - prev = cookie; - subvol = prev->this; - - if (op_ret == 0) - err = 0; - else - err = op_errno; - - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].xlator == subvol) { - layout->list[i].err = err; - break; - } - } + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + xlator_t *subvol = NULL; + int i = 0; + dht_layout_t *layout = NULL; + int err = 0; + int this_call_cnt = 0; + + local = frame->local; + layout = local->selfheal.layout; + prev = cookie; + subvol = prev->this; + + if (op_ret == 0) + err = 0; + else + err = op_errno; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].xlator == subvol) { + layout->list[i].err = err; + break; + } + } - this_call_cnt = dht_frame_return (frame); + this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - dht_selfheal_dir_finish (frame, this, 0); - } + if (is_last_call (this_call_cnt)) { + dht_selfheal_dir_finish (frame, this, 0); + } - return 0; + return 0; } int dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc, - dht_layout_t *layout, int i) + dht_layout_t *layout, int i, + xlator_t *req_subvol) { - xlator_t *subvol = NULL; - dict_t *xattr = NULL; - int ret = 0; - xlator_t *this = NULL; - int32_t *disk_layout = NULL; + xlator_t *subvol = NULL; + dict_t *xattr = NULL; + int ret = 0; + xlator_t *this = NULL; + int32_t *disk_layout = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + data_t *data = NULL; + + local = frame->local; + if (req_subvol) + subvol = req_subvol; + else + subvol = layout->list[i].xlator; + this = frame->this; + + GF_VALIDATE_OR_GOTO ("", this, err); + GF_VALIDATE_OR_GOTO (this->name, layout, err); + GF_VALIDATE_OR_GOTO (this->name, local, err); + GF_VALIDATE_OR_GOTO (this->name, subvol, err); + VALIDATE_OR_GOTO (this->private, err); + + conf = this->private; + + xattr = get_new_dict (); + if (!xattr) { + goto err; + } + ret = dht_disk_layout_extract (this, layout, i, &disk_layout); + if (ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s: (subvol %s) failed to extract disk layout", + loc->path, subvol->name); + goto err; + } - subvol = layout->list[i].xlator; - this = frame->this; + ret = dict_set_bin (xattr, conf->xattr_name, disk_layout, 4 * 4); + if (ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s: (subvol %s) failed to set xattr dictionary", + loc->path, subvol->name); + goto err; + } + disk_layout = NULL; + + gf_log (this->name, GF_LOG_TRACE, + "setting hash range %u - %u (type %d) on subvolume %s for %s", + layout->list[i].start, layout->list[i].stop, + layout->type, subvol->name, loc->path); + + dict_ref (xattr); + if (local->xattr) { + data = dict_get (local->xattr, QUOTA_LIMIT_KEY); + if (data) { + ret = dict_add (xattr, QUOTA_LIMIT_KEY, data); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to " + "set quota limit key on %s",loc->path); + } + } + } + if (!uuid_is_null (local->gfid)) + uuid_copy (loc->gfid, local->gfid); - xattr = get_new_dict (); - if (!xattr) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + STACK_WIND (frame, dht_selfheal_dir_xattr_cbk, + subvol, subvol->fops->setxattr, + loc, xattr, 0, NULL); - ret = dht_disk_layout_extract (this, layout, i, &disk_layout); - if (ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "failed to extract disk layout"); - goto err; - } + dict_unref (xattr); - ret = dict_set_bin (xattr, "trusted.glusterfs.dht", - disk_layout, 4 * 4); - if (ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "failed to set xattr dictionary"); - goto err; - } - disk_layout = NULL; + return 0; - gf_log (this->name, GF_LOG_TRACE, - "setting hash range %u - %u (type %d) on subvolume %s for %s", - layout->list[i].start, layout->list[i].stop, - layout->type, subvol->name, loc->path); +err: + if (xattr) + dict_destroy (xattr); - dict_ref (xattr); + GF_FREE (disk_layout); - STACK_WIND (frame, dht_selfheal_dir_xattr_cbk, - subvol, subvol->fops->setxattr, - loc, xattr, 0); + dht_selfheal_dir_xattr_cbk (frame, subvol, frame->this, + -1, ENOMEM, NULL); + return 0; +} - dict_unref (xattr); +int +dht_fix_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) +{ + dht_local_t *local = NULL; + int i = 0; + int count = 0; + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + dht_layout_t *dummy = NULL; - return 0; + local = frame->local; + this = frame->this; + conf = this->private; -err: - if (xattr) - dict_destroy (xattr); + gf_log (this->name, GF_LOG_DEBUG, + "writing the new range for all subvolumes"); - if (disk_layout) - FREE (disk_layout); + local->call_cnt = count = conf->subvolume_cnt; - dht_selfheal_dir_xattr_cbk (frame, subvol, frame->this, - -1, ENOMEM); - return 0; -} + for (i = 0; i < layout->cnt; i++) { + dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i, NULL); + if (--count == 0) + goto out; + } + /* if we are here, subvolcount > layout_count. subvols-per-directory + * option might be set here. We need to clear out layout from the + * non-participating subvolumes, else it will result in overlaps */ + dummy = dht_layout_new (this, 1); + if (!dummy) + goto out; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (_gf_false == + dht_is_subvol_in_layout (layout, conf->subvolumes[i])) { + dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0, + conf->subvolumes[i]); + if (--count == 0) + break; + } + } + + dht_layout_unref (this, dummy); +out: + return 0; +} int dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) { - dht_local_t *local = NULL; - int missing_xattr = 0; - int i = 0; - int ret = 0; - xlator_t *this = NULL; - - local = frame->local; - this = frame->this; - - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].err != -1 || !layout->list[i].stop) { - /* err != -1 would mean xattr present on the directory - * or the directory is itself non existant. - * !layout->list[i].stop would mean layout absent - */ - continue; - } - missing_xattr++; - } + dht_local_t *local = NULL; + int missing_xattr = 0; + int i = 0; + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + dht_layout_t *dummy = NULL; + + local = frame->local; + this = frame->this; + conf = this->private; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err != -1 || !layout->list[i].stop) { + /* err != -1 would mean xattr present on the directory + * or the directory is non existent. + * !layout->list[i].stop would mean layout absent + */ + + continue; + } + missing_xattr++; + } + /* Also account for subvolumes with no-layout. Used for zero'ing out + * the layouts and for setting quota key's if present */ + for (i = 0; i < conf->subvolume_cnt; i++) { + if (_gf_false == + dht_is_subvol_in_layout (layout, conf->subvolumes[i])) { + missing_xattr++; + } + } + gf_log (this->name, GF_LOG_TRACE, + "%d subvolumes missing xattr for %s", + missing_xattr, loc->path); - gf_log (this->name, GF_LOG_TRACE, - "%d subvolumes missing xattr for %s", - missing_xattr, loc->path); + if (missing_xattr == 0) { + dht_selfheal_dir_finish (frame, this, 0); + return 0; + } - if (missing_xattr == 0) { - dht_selfheal_dir_finish (frame, this, 0); - return 0; - } + local->call_cnt = missing_xattr; + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err != -1 || !layout->list[i].stop) + continue; - local->call_cnt = missing_xattr; + dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i, NULL); - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].err != -1 || !layout->list[i].stop) - continue; + if (--missing_xattr == 0) + break; + } + dummy = dht_layout_new (this, 1); + if (!dummy) + goto out; + for (i = 0; i < conf->subvolume_cnt && missing_xattr; i++) { + if (_gf_false == + dht_is_subvol_in_layout (layout, conf->subvolumes[i])) { + dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0, + conf->subvolumes[i]); + missing_xattr--; + } + } - ret = dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i); + dht_layout_unref (this, dummy); +out: + return 0; +} - if (--missing_xattr == 0) - break; - } - return 0; +int +dht_selfheal_dir_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) +{ + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int this_call_cnt = 0; + + local = frame->local; + layout = local->selfheal.layout; + + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + dht_selfheal_dir_xattr (frame, &local->loc, layout); + } + + return 0; } int +dht_selfheal_dir_setattr (call_frame_t *frame, loc_t *loc, struct iatt *stbuf, + int32_t valid, dht_layout_t *layout) +{ + int missing_attr = 0; + int i = 0; + dht_local_t *local = NULL; + xlator_t *this = NULL; + + local = frame->local; + this = frame->this; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err == -1) + missing_attr++; + } + + if (missing_attr == 0) { + dht_selfheal_dir_xattr (frame, loc, layout); + return 0; + } + + if (!uuid_is_null (local->gfid)) + uuid_copy (loc->gfid, local->gfid); + + local->call_cnt = missing_attr; + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err == -1) { + gf_log (this->name, GF_LOG_TRACE, + "setattr for %s on subvol %s", + loc->path, layout->list[i].xlator->name); + + STACK_WIND (frame, dht_selfheal_dir_setattr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->setattr, + loc, stbuf, valid, NULL); + } + } + + return 0; +} + +int dht_selfheal_dir_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct stat *stbuf) + int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; - call_frame_t *prev = NULL; - xlator_t *subvol = NULL; - int i = 0; - int this_call_cnt = 0; - - - local = frame->local; - layout = local->selfheal.layout; - prev = cookie; - subvol = prev->this; - - if ((op_ret == 0) || (op_errno == EEXIST)) { - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].xlator == subvol) { - layout->list[i].err = -1; - break; - } - } - } + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + call_frame_t *prev = NULL; + xlator_t *subvol = NULL; + int i = 0; + int this_call_cnt = 0; - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - dht_selfheal_dir_xattr (frame, &local->loc, layout); - } + local = frame->local; + layout = local->selfheal.layout; + prev = cookie; + subvol = prev->this; - return 0; + if ((op_ret == 0) || ((op_ret == -1) && (op_errno == EEXIST))) { + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].xlator == subvol) { + layout->list[i].err = -1; + break; + } + } + } + + if (op_ret) { + gf_log (this->name, ((op_errno == EEXIST) ? GF_LOG_DEBUG : + GF_LOG_WARNING), + "selfhealing directory %s failed: %s", + local->loc.path, strerror (op_errno)); + goto out; + } + + dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); + dht_iatt_merge (this, &local->preparent, preparent, prev->this); + dht_iatt_merge (this, &local->postparent, postparent, prev->this); + +out: + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + dht_selfheal_dir_setattr (frame, &local->loc, &local->stbuf, 0xffffff, layout); + } + + return 0; } +void +dht_selfheal_dir_mkdir_setacl (dict_t *xattr, dict_t *dict) +{ + data_t *acl_default = NULL; + data_t *acl_access = NULL; + xlator_t *this = NULL; + int ret = -1; + + GF_ASSERT (xattr); + GF_ASSERT (dict); + + this = THIS; + GF_ASSERT (this); + + acl_default = dict_get (xattr, POSIX_ACL_DEFAULT_XATTR); + + if (!acl_default) { + gf_log (this->name, GF_LOG_DEBUG, + "ACL_DEFAULT xattr not present"); + goto cont; + } + ret = dict_set (dict, POSIX_ACL_DEFAULT_XATTR, acl_default); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "Could not set ACL_DEFAULT xattr"); +cont: + acl_access = dict_get (xattr, POSIX_ACL_ACCESS_XATTR); + if (!acl_access) { + gf_log (this->name, GF_LOG_DEBUG, + "ACL_ACCESS xattr not present"); + goto out; + } + ret = dict_set (dict, POSIX_ACL_ACCESS_XATTR, acl_access); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "Could not set ACL_ACCESS xattr"); + +out: + return; +} int dht_selfheal_dir_mkdir (call_frame_t *frame, loc_t *loc, - dht_layout_t *layout, int force) + dht_layout_t *layout, int force) { - int missing_dirs = 0; - int i = 0; - dht_local_t *local = NULL; - xlator_t *this = NULL; + int missing_dirs = 0; + int i = 0; + int ret = -1; + dht_local_t *local = NULL; + xlator_t *this = NULL; + dict_t *dict = NULL; + + local = frame->local; + this = frame->this; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err == ENOENT || force) + missing_dirs++; + } + if (missing_dirs == 0) { + dht_selfheal_dir_setattr (frame, loc, &local->stbuf, 0xffffffff, layout); + return 0; + } - local = frame->local; - this = frame->this; + local->call_cnt = missing_dirs; + if (!uuid_is_null (local->gfid)) { + dict = dict_new (); + if (!dict) + return -1; + + ret = dict_set_static_bin (dict, "gfid-req", local->gfid, 16); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set gfid in dict", loc->path); + } else if (local->params) { + /* Send the dictionary from higher layers directly */ + dict = dict_ref (local->params); + } + /* Set acls */ + if (local->xattr && dict) + dht_selfheal_dir_mkdir_setacl (local->xattr, dict); + + if (!dict) + gf_log (this->name, GF_LOG_WARNING, + "dict is NULL, need to make sure gfids are same"); + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err == ENOENT || force) { + gf_log (this->name, GF_LOG_DEBUG, + "creating directory %s on subvol %s", + loc->path, layout->list[i].xlator->name); + + STACK_WIND (frame, dht_selfheal_dir_mkdir_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->mkdir, + loc, + st_mode_from_ia (local->stbuf.ia_prot, + local->stbuf.ia_type), + 0, dict); + } + } - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].err == ENOENT || force) - missing_dirs++; - } + if (dict) + dict_unref (dict); - if (missing_dirs == 0) { - dht_selfheal_dir_xattr (frame, loc, layout); - return 0; - } + return 0; +} - local->call_cnt = missing_dirs; - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].err == ENOENT || force) { - gf_log (this->name, GF_LOG_TRACE, - "creating directory %s on subvol %s", - loc->path, layout->list[i].xlator->name); - - STACK_WIND (frame, dht_selfheal_dir_mkdir_cbk, - layout->list[i].xlator, - layout->list[i].xlator->fops->mkdir, - loc, local->stbuf.st_mode); - } - } - return 0; +int +dht_selfheal_layout_alloc_start (xlator_t *this, loc_t *loc, + dht_layout_t *layout) +{ + int start = 0; + uint32_t hashval = 0; + int ret = 0; + + ret = dht_hash_compute (this, layout->type, loc->path, &hashval); + if (ret == 0) { + start = (hashval % layout->cnt); + } + + return start; } -void -dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, - dht_layout_t *layout) +static inline int +dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout) { - dht_conf_t *conf = NULL; - xlator_t *this = NULL; - uint32_t chunk = 0; - int i = 0; - uint32_t start = 0; - int cnt = 0; - int err = 0; - - this = frame->this; - conf = this->private; - - for (i = 0; i < layout->cnt; i++) { - err = layout->list[i].err; - if (err == -1 || err == 0) { - layout->list[i].err = -1; - cnt++; - } - } + int i = 0; + int j = 0; + int err = 0; + int count = 0; + dht_conf_t *conf = NULL; + + /* Gets in use only for replace-brick, remove-brick */ + conf = this->private; + for (i = 0; i < layout->cnt; i++) { + for (j = 0; j < conf->subvolume_cnt; j++) { + if (conf->decommissioned_bricks[j] && + conf->decommissioned_bricks[j] == layout->list[i].xlator) { + layout->list[i].err = EINVAL; + break; + } + } + } + + for (i = 0; i < layout->cnt; i++) { + err = layout->list[i].err; + if (err == -1 || err == 0 || err == ENOENT) { + /* Setting list[i].err = -1 is an indication for + dht_selfheal_layout_new_directory() to assign + a range. We set it to -1 based on any one of + the three criteria: + + - err == -1 already, which means directory + existed but layout was not set on it. + + - err == 0, which means directory exists and + has an old layout piece which will be + overwritten now. + + - err == ENOENT, which means directory does + not exist (possibly racing with mkdir or + finishing half done mkdir). The missing + directory will be attempted to be recreated. + + It is important to note that it is safe + to race with mkdir() as self-heal and + mkdir are idempotent operations. Both will + strive to set the directory and layouts to + the same final state. + */ + count++; + if (!err) + layout->list[i].err = -1; + } + } /* no subvolume has enough space, but can't stop directory creation */ - if (!cnt) { + if (!count || !new_layout) { for (i = 0; i < layout->cnt; i++) { err = layout->list[i].err; if (err == ENOSPC) { layout->list[i].err = -1; - cnt++; + count++; } } } - chunk = ((unsigned long) 0xffffffff) / cnt; - - start = 0; - for (i = 0; i < layout->cnt; i++) { - err = layout->list[i].err; - if (err == -1) { - layout->list[i].start = start; - layout->list[i].stop = start + chunk - 1; - - start = start + chunk; - - gf_log (this->name, GF_LOG_TRACE, - "gave fix: %u - %u on %s for %s", - layout->list[i].start, layout->list[i].stop, - layout->list[i].xlator->name, loc->path); - if (--cnt == 0) { - layout->list[i].stop = 0xffffffff; - break; - } - } + /* if layout->spread_cnt is set, check if it is <= available + * subvolumes (down brick and decommissioned bricks are considered + * un-availbale). Else return count (available up bricks) */ + count = ((layout->spread_cnt && + (layout->spread_cnt <= count)) ? + layout->spread_cnt : ((count) ? count : 1)); + + return count; +} + + +void dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, + dht_layout_t *new_layout); + +void dht_layout_entry_swap (dht_layout_t *layout, int i, int j); +void dht_layout_range_swap (dht_layout_t *layout, int i, int j); + +/* + * It's a bit icky using local variables in a macro, but it makes the rest + * of the code a lot clearer. + */ +#define OV_ENTRY(x,y) table[x*new->cnt+y] + +void +dht_selfheal_layout_maximize_overlap (call_frame_t *frame, loc_t *loc, + dht_layout_t *new, dht_layout_t *old) +{ + int i = 0; + int j = 0; + uint32_t curr_overlap = 0; + uint32_t max_overlap = 0; + int max_overlap_idx = -1; + uint32_t overlap = 0; + uint32_t *table = NULL; + + dht_layout_sort_volname (old); + /* Now both old_layout->list[] and new_layout->list[] + are match the same xlators/subvolumes. i.e, + old_layout->[i] and new_layout->[i] are referring + to the same subvolumes + */ + + /* Build a table of overlaps between new[i] and old[j]. */ + table = alloca(sizeof(overlap)*old->cnt*new->cnt); + if (!table) { + return; + } + memset(table,0,sizeof(overlap)*old->cnt*new->cnt); + for (i = 0; i < new->cnt; ++i) { + for (j = 0; j < old->cnt; ++j) { + OV_ENTRY(i,j) = dht_overlap_calc(old,j,new,i); + } + } + + for (i = 0; i < new->cnt; i++) { + if (new->list[i].err > 0) { + /* Subvol might be marked for decommission + with EINVAL, or some other serious error + marked with positive errno. + */ + continue; + } + + max_overlap = 0; + max_overlap_idx = i; + for (j = (i + 1); j < new->cnt; ++j) { + if (new->list[j].err > 0) { + /* Subvol might be marked for decommission + with EINVAL, or some other serious error + marked with positive errno. + */ + continue; + } + /* Calculate the overlap now. */ + curr_overlap = OV_ENTRY(i,i) + OV_ENTRY(j,j); + /* Calculate the overlap after the proposed swap. */ + overlap = OV_ENTRY(i,j) + OV_ENTRY(j,i); + /* Are we better than status quo? */ + if (overlap > curr_overlap) { + overlap -= curr_overlap; + /* Are we better than the previous choice? */ + if (overlap > max_overlap) { + max_overlap = overlap; + max_overlap_idx = j; + } + } + } + + if (max_overlap_idx != i) { + dht_layout_range_swap (new, i, max_overlap_idx); + /* Need to swap the table values too. */ + for (j = 0; j < old->cnt; ++j) { + overlap = OV_ENTRY(i,j); + OV_ENTRY(i,j) = OV_ENTRY(max_overlap_idx,j); + OV_ENTRY(max_overlap_idx,j) = overlap; + } + } } } +dht_layout_t * +dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc, + dht_layout_t *layout) +{ + int i = 0; + xlator_t *this = NULL; + dht_layout_t *new_layout = NULL; + dht_conf_t *priv = NULL; + dht_local_t *local = NULL; + uint32_t subvol_down = 0; + int ret = 0; + + this = frame->this; + priv = this->private; + local = frame->local; + + if (layout->type == DHT_HASH_TYPE_DM_USER) { + gf_log (THIS->name, GF_LOG_DEBUG, "leaving %s alone", + loc->path); + goto done; + } + + new_layout = dht_layout_new (this, priv->subvolume_cnt); + if (!new_layout) + goto done; + + /* If a subvolume is down, do not re-write the layout. */ + ret = dht_layout_anomalies (this, loc, layout, NULL, NULL, NULL, + &subvol_down, NULL, NULL); + + if (subvol_down || (ret == -1)) { + gf_log (this->name, GF_LOG_WARNING, "%u subvolume(s) are down" + ". Skipping fix layout.", subvol_down); + GF_FREE (new_layout); + return NULL; + } + + for (i = 0; i < new_layout->cnt; i++) { + if (layout->list[i].err != ENOSPC) + new_layout->list[i].err = layout->list[i].err; + else + new_layout->list[i].err = -1; + + new_layout->list[i].xlator = layout->list[i].xlator; + } + + /* First give it a layout as though it is a new directory. This + ensures rotation to kick in */ + dht_layout_sort_volname (new_layout); + dht_selfheal_layout_new_directory (frame, loc, new_layout); + + /* Now selectively re-assign ranges only when it helps */ + dht_selfheal_layout_maximize_overlap (frame, loc, new_layout, layout); + +done: + if (new_layout) { + /* Now that the new layout has all the proper layout, change the + inode context */ + dht_layout_set (this, loc->inode, new_layout); + + /* Make sure the extra 'ref' for existing layout is removed */ + dht_layout_unref (this, local->layout); + + local->layout = new_layout; + } + + return local->layout; +} + + +void +dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, + dht_layout_t *layout) +{ + xlator_t *this = NULL; + uint32_t chunk = 0; + int i = 0; + uint32_t start = 0; + int cnt = 0; + int err = 0; + int start_subvol = 0; + + this = frame->this; + + cnt = dht_get_layout_count (this, layout, 1); + + chunk = ((unsigned long) 0xffffffff) / ((cnt) ? cnt : 1); + + start_subvol = dht_selfheal_layout_alloc_start (this, loc, layout); + + /* clear out the range, as we are re-computing here */ + DHT_RESET_LAYOUT_RANGE (layout); + for (i = start_subvol; i < layout->cnt; i++) { + err = layout->list[i].err; + if (err == -1 || err == ENOENT) { + DHT_SET_LAYOUT_RANGE(layout, i, start, chunk, + cnt, loc->path); + if (--cnt == 0) { + layout->list[i].stop = 0xffffffff; + goto done; + } + start += chunk; + } + } + + for (i = 0; i < start_subvol; i++) { + err = layout->list[i].err; + if (err == -1 || err == ENOENT) { + DHT_SET_LAYOUT_RANGE(layout, i, start, chunk, + cnt, loc->path); + if (--cnt == 0) { + layout->list[i].stop = 0xffffffff; + goto done; + } + start += chunk; + } + } + +done: + return; +} + int dht_selfheal_dir_getafix (call_frame_t *frame, loc_t *loc, - dht_layout_t *layout) + dht_layout_t *layout) { - dht_conf_t *conf = NULL; - xlator_t *this = NULL; - dht_local_t *local = NULL; - int missing = -1; - int down = -1; - int holes = -1; - int ret = -1; - int i = -1; - int overlaps = -1; - - this = frame->this; - conf = this->private; - local = frame->local; - - missing = local->selfheal.missing; - down = local->selfheal.down; - holes = local->selfheal.hole_cnt; - overlaps = local->selfheal.overlaps_cnt; - - if ((missing + down) == conf->subvolume_cnt) { - dht_selfheal_layout_new_directory (frame, loc, layout); - ret = 0; - } + dht_local_t *local = NULL; + uint32_t holes = 0; + int ret = -1; + int i = -1; + uint32_t overlaps = 0; - if (holes <= down) { - /* the down subvol might fill up the holes */ - ret = 0; - } + local = frame->local; - if (holes || missing || overlaps) { - dht_selfheal_layout_new_directory (frame, loc, layout); - ret = 0; - } + holes = local->selfheal.hole_cnt; + overlaps = local->selfheal.overlaps_cnt; - for (i = 0; i < layout->cnt; i++) { - /* directory not present */ - if (layout->list[i].err == ENOENT) { - ret = 0; - break; - } - } + if (holes || overlaps) { + dht_selfheal_layout_new_directory (frame, loc, layout); + ret = 0; + } - /* TODO: give a fix to these non-virgins */ + for (i = 0; i < layout->cnt; i++) { + /* directory not present */ + if (layout->list[i].err == ENOENT) { + ret = 0; + break; + } + } + + /* TODO: give a fix to these non-virgins */ - return ret; + return ret; } int -dht_selfheal_new_directory (call_frame_t *frame, - dht_selfheal_dir_cbk_t dir_cbk, - dht_layout_t *layout) +dht_selfheal_new_directory (call_frame_t *frame, + dht_selfheal_dir_cbk_t dir_cbk, + dht_layout_t *layout) { - dht_local_t *local = NULL; + dht_local_t *local = NULL; - local = frame->local; + local = frame->local; - local->selfheal.dir_cbk = dir_cbk; - local->selfheal.layout = layout; + local->selfheal.dir_cbk = dir_cbk; + local->selfheal.layout = dht_layout_ref (frame->this, layout); - dht_layout_sort_volname (layout); - dht_selfheal_layout_new_directory (frame, &local->loc, layout); - dht_selfheal_dir_xattr (frame, &local->loc, layout); - return 0; + dht_layout_sort_volname (layout); + dht_selfheal_layout_new_directory (frame, &local->loc, layout); + dht_selfheal_dir_xattr (frame, &local->loc, layout); + return 0; +} + +int +dht_fix_directory_layout (call_frame_t *frame, + dht_selfheal_dir_cbk_t dir_cbk, + dht_layout_t *layout) +{ + dht_local_t *local = NULL; + dht_layout_t *tmp_layout = NULL; + + local = frame->local; + + local->selfheal.dir_cbk = dir_cbk; + local->selfheal.layout = dht_layout_ref (frame->this, layout); + + /* No layout sorting required here */ + tmp_layout = dht_fix_layout_of_directory (frame, &local->loc, layout); + if (!tmp_layout) { + return -1; + } + dht_fix_dir_xattr (frame, &local->loc, tmp_layout); + + return 0; } int dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, - loc_t *loc, dht_layout_t *layout) + loc_t *loc, dht_layout_t *layout) { - dht_local_t *local = NULL; - uint32_t holes = 0; - uint32_t overlaps = 0; - uint32_t missing = 0; - uint32_t down = 0; - uint32_t misc = 0; - int ret = 0; - xlator_t *this = NULL; - - local = frame->local; - this = frame->this; - - ret = dht_layout_anomalies (this, loc, layout, - &local->selfheal.hole_cnt, - &local->selfheal.overlaps_cnt, - &local->selfheal.missing, - &local->selfheal.down, - &local->selfheal.misc); - - holes = local->selfheal.hole_cnt; - overlaps = local->selfheal.overlaps_cnt; - missing = local->selfheal.missing; - down = local->selfheal.down; - misc = local->selfheal.misc; - - local->selfheal.dir_cbk = dir_cbk; - local->selfheal.layout = layout; - - if (down) { - gf_log (this->name, GF_LOG_DEBUG, - "%d subvolumes down -- not fixing", down); - ret = 0; - goto sorry_no_fix; - } + dht_local_t *local = NULL; + uint32_t down = 0; + uint32_t misc = 0; + int ret = 0; + xlator_t *this = NULL; + + local = frame->local; + this = frame->this; + + dht_layout_anomalies (this, loc, layout, + &local->selfheal.hole_cnt, + &local->selfheal.overlaps_cnt, + NULL, &local->selfheal.down, + &local->selfheal.misc, NULL); + + down = local->selfheal.down; + misc = local->selfheal.misc; + + local->selfheal.dir_cbk = dir_cbk; + local->selfheal.layout = dht_layout_ref (this, layout); + + if (down) { + gf_log (this->name, GF_LOG_WARNING, + "%d subvolumes down -- not fixing", down); + ret = 0; + goto sorry_no_fix; + } - if (misc) { - gf_log (this->name, GF_LOG_DEBUG, - "%d subvolumes have unrecoverable errors", misc); - ret = 0; - goto sorry_no_fix; - } + if (misc) { + gf_log (this->name, GF_LOG_WARNING, + "%d subvolumes have unrecoverable errors", misc); + ret = 0; + goto sorry_no_fix; + } - dht_layout_sort_volname (layout); - ret = dht_selfheal_dir_getafix (frame, loc, layout); + dht_layout_sort_volname (layout); + ret = dht_selfheal_dir_getafix (frame, loc, layout); - if (ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "not able to form layout for the directory"); - goto sorry_no_fix; - } + if (ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "not able to form layout for the directory"); + goto sorry_no_fix; + } - dht_selfheal_dir_mkdir (frame, loc, layout, 0); + dht_selfheal_dir_mkdir (frame, loc, layout, 0); - return 0; + return 0; sorry_no_fix: - /* TODO: need to put appropriate local->op_errno */ - dht_selfheal_dir_finish (frame, this, ret); + /* TODO: need to put appropriate local->op_errno */ + dht_selfheal_dir_finish (frame, this, ret); - return 0; + return 0; } int dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, - loc_t *loc, dht_layout_t *layout) + loc_t *loc, dht_layout_t *layout) { - int ret = 0; - dht_local_t *local = NULL; + int ret = 0; + dht_local_t *local = NULL; + local = frame->local; - local = frame->local; + local->selfheal.dir_cbk = dir_cbk; + local->selfheal.layout = dht_layout_ref (frame->this, layout); - local->selfheal.dir_cbk = dir_cbk; - local->selfheal.layout = layout; + ret = dht_selfheal_dir_mkdir (frame, loc, layout, 1); - ret = dht_selfheal_dir_mkdir (frame, loc, layout, 1); + return ret; +} - return 0; +int +dht_dir_attr_heal (void *data) +{ + call_frame_t *frame = NULL; + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + int call_cnt = 0; + int ret = -1; + int i = 0; + + GF_VALIDATE_OR_GOTO ("dht", data, out); + + frame = data; + local = frame->local; + this = frame->this; + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", local, out); + conf = this->private; + GF_VALIDATE_OR_GOTO ("dht", conf, out); + + call_cnt = conf->subvolume_cnt; + + for (i = 0; i < call_cnt; i++) { + subvol = conf->subvolumes[i]; + if (!subvol || (subvol == dht_first_up_subvol (this))) + continue; + ret = syncop_setattr (subvol, &local->loc, &local->stbuf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID), + NULL, NULL); + if (ret) { + gf_log ("dht", GF_LOG_ERROR, "Failed to set uid/gid on" + " %s on %s subvol (%s)", local->loc.path, + subvol->name, strerror (-ret)); + } + } +out: + return 0; +} + +int +dht_dir_attr_heal_done (int ret, call_frame_t *sync_frame, void *data) +{ + DHT_STACK_DESTROY (sync_frame); + return 0; } diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c new file mode 100644 index 000000000..f2e7467ab --- /dev/null +++ b/xlators/cluster/dht/src/dht-shared.c @@ -0,0 +1,780 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +/* TODO: add NS locking */ + +#include "statedump.h" +#include "dht-common.h" + +/* TODO: + - use volumename in xattr instead of "dht" + - use NS locks + - handle all cases in self heal layout reconstruction + - complete linkfile selfheal +*/ +struct volume_options options[]; + +void +dht_layout_dump (dht_layout_t *layout, const char *prefix) +{ + + char key[GF_DUMP_MAX_BUF_LEN]; + int i = 0; + + if (!layout) + goto out; + if (!prefix) + goto out; + + gf_proc_dump_build_key(key, prefix, "cnt"); + gf_proc_dump_write(key, "%d", layout->cnt); + gf_proc_dump_build_key(key, prefix, "preset"); + gf_proc_dump_write(key, "%d", layout->preset); + gf_proc_dump_build_key(key, prefix, "gen"); + gf_proc_dump_write(key, "%d", layout->gen); + if (layout->type != IA_INVAL) { + gf_proc_dump_build_key(key, prefix, "inode type"); + gf_proc_dump_write(key, "%d", layout->type); + } + + if (!IA_ISDIR (layout->type)) + goto out; + + for (i = 0; i < layout->cnt; i++) { + gf_proc_dump_build_key(key, prefix,"list[%d].err", i); + gf_proc_dump_write(key, "%d", layout->list[i].err); + gf_proc_dump_build_key(key, prefix,"list[%d].start", i); + gf_proc_dump_write(key, "%u", layout->list[i].start); + gf_proc_dump_build_key(key, prefix,"list[%d].stop", i); + gf_proc_dump_write(key, "%u", layout->list[i].stop); + if (layout->list[i].xlator) { + gf_proc_dump_build_key(key, prefix, + "list[%d].xlator.type", i); + gf_proc_dump_write(key, "%s", + layout->list[i].xlator->type); + gf_proc_dump_build_key(key, prefix, + "list[%d].xlator.name", i); + gf_proc_dump_write(key, "%s", + layout->list[i].xlator->name); + } + } + +out: + return; +} + + +int32_t +dht_priv_dump (xlator_t *this) +{ + char key_prefix[GF_DUMP_MAX_BUF_LEN]; + char key[GF_DUMP_MAX_BUF_LEN]; + int i = 0; + dht_conf_t *conf = NULL; + int ret = -1; + + if (!this) + goto out; + + conf = this->private; + if (!conf) + goto out; + + ret = TRY_LOCK(&conf->subvolume_lock); + if (ret != 0) { + return ret; + } + + gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name); + gf_proc_dump_build_key(key_prefix,"xlator.cluster.dht","%s.priv", + this->name); + gf_proc_dump_write("subvol_cnt","%d", conf->subvolume_cnt); + for (i = 0; i < conf->subvolume_cnt; i++) { + snprintf (key, sizeof (key), "subvolumes[%d]", i); + gf_proc_dump_write(key, "%s.%s", conf->subvolumes[i]->type, + conf->subvolumes[i]->name); + if (conf->file_layouts && conf->file_layouts[i]){ + snprintf (key, sizeof (key), "file_layouts[%d]", i); + dht_layout_dump(conf->file_layouts[i], key); + } + if (conf->dir_layouts && conf->dir_layouts[i]) { + snprintf (key, sizeof (key), "dir_layouts[%d]", i); + dht_layout_dump(conf->dir_layouts[i], key); + } + if (conf->subvolume_status) { + + snprintf (key, sizeof (key), "subvolume_status[%d]", i); + gf_proc_dump_write(key, "%d", + (int)conf->subvolume_status[i]); + } + + } + + gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed); + gf_proc_dump_write("gen", "%d", conf->gen); + gf_proc_dump_write("min_free_disk", "%lf", conf->min_free_disk); + gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes); + gf_proc_dump_write("disk_unit", "%c", conf->disk_unit); + gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval); + gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit); + + if (conf->du_stats) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!conf->subvolume_status[i]) + continue; + + snprintf (key, sizeof (key), "subvolumes[%d]", i); + gf_proc_dump_write (key, "%s", + conf->subvolumes[i]->name); + + snprintf (key, sizeof (key), + "du_stats[%d].avail_percent", i); + gf_proc_dump_write (key, "%lf", + conf->du_stats[i].avail_percent); + + snprintf (key, sizeof (key), "du_stats[%d].avail_space", + i); + gf_proc_dump_write (key, "%lu", + conf->du_stats[i].avail_space); + + snprintf (key, sizeof (key), + "du_stats[%d].avail_inodes", i); + gf_proc_dump_write (key, "%lf", + conf->du_stats[i].avail_inodes); + + snprintf (key, sizeof (key), "du_stats[%d].log", i); + gf_proc_dump_write (key, "%lu", + conf->du_stats[i].log); + } + } + + if (conf->last_stat_fetch.tv_sec) + gf_proc_dump_write("last_stat_fetch", "%s", + ctime(&conf->last_stat_fetch.tv_sec)); + + UNLOCK(&conf->subvolume_lock); + +out: + return ret; +} + +int32_t +dht_inodectx_dump (xlator_t *this, inode_t *inode) +{ + int ret = -1; + dht_layout_t *layout = NULL; + + if (!this) + goto out; + if (!inode) + goto out; + + ret = dht_inode_ctx_layout_get (inode, this, &layout); + + if ((ret != 0) || !layout) + return ret; + + gf_proc_dump_add_section("xlator.cluster.dht.%s.inode", this->name); + dht_layout_dump(layout, "layout"); + +out: + return ret; +} + +void +dht_fini (xlator_t *this) +{ + int i = 0; + dht_conf_t *conf = NULL; + + GF_VALIDATE_OR_GOTO ("dht", this, out); + + conf = this->private; + this->private = NULL; + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + GF_FREE (conf->file_layouts[i]); + } + GF_FREE (conf->file_layouts); + } + + GF_FREE (conf->subvolumes); + + GF_FREE (conf->subvolume_status); + + GF_FREE (conf); + } +out: + return; +} + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + GF_VALIDATE_OR_GOTO ("dht", this, out); + + ret = xlator_mem_acct_init (this, gf_dht_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); + return ret; + } +out: + return ret; +} + + +int +dht_parse_decommissioned_bricks (xlator_t *this, dht_conf_t *conf, + const char *bricks) +{ + int i = 0; + int ret = -1; + char *tmpstr = NULL; + char *dup_brick = NULL; + char *node = NULL; + + if (!conf || !bricks) + goto out; + + dup_brick = gf_strdup (bricks); + node = strtok_r (dup_brick, ",", &tmpstr); + while (node) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!strcmp (conf->subvolumes[i]->name, node)) { + conf->decommissioned_bricks[i] = + conf->subvolumes[i]; + conf->decommission_subvols_cnt++; + gf_log (this->name, GF_LOG_INFO, + "decommissioning subvolume %s", + conf->subvolumes[i]->name); + break; + } + } + if (i == conf->subvolume_cnt) { + /* Wrong node given. */ + goto out; + } + node = strtok_r (NULL, ",", &tmpstr); + } + + ret = 0; + conf->decommission_in_progress = 1; +out: + GF_FREE (dup_brick); + + return ret; +} + + +int +dht_decommissioned_remove (xlator_t *this, dht_conf_t *conf) +{ + int i = 0; + int ret = -1; + + if (!conf) + goto out; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->decommissioned_bricks[i]) { + conf->decommissioned_bricks[i] = NULL; + conf->decommission_subvols_cnt--; + } + } + + ret = 0; +out: + + return ret; +} +void +dht_init_regex (xlator_t *this, dict_t *odict, char *name, + regex_t *re, gf_boolean_t *re_valid) +{ + char *temp_str; + + if (dict_get_str (odict, name, &temp_str) != 0) { + if (strcmp(name,"rsync-hash-regex")) { + return; + } + temp_str = "^\\.(.+)\\.[^.]+$"; + } + + if (*re_valid) { + regfree(re); + *re_valid = _gf_false; + } + + if (!strcmp(temp_str,"none")) { + return; + } + + if (regcomp(re,temp_str,REG_EXTENDED) == 0) { + gf_log (this->name, GF_LOG_INFO, + "using regex %s = %s", name, temp_str); + *re_valid = _gf_true; + } + else { + gf_log (this->name, GF_LOG_WARNING, + "compiling regex %s failed", temp_str); + } +} + +int +dht_reconfigure (xlator_t *this, dict_t *options) +{ + dht_conf_t *conf = NULL; + char *temp_str = NULL; + gf_boolean_t search_unhashed; + int ret = -1; + + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", options, out); + + conf = this->private; + if (!conf) + return 0; + + if (dict_get_str (options, "lookup-unhashed", &temp_str) == 0) { + /* If option is not "auto", other options _should_ be boolean*/ + if (strcasecmp (temp_str, "auto")) { + if (!gf_string2boolean (temp_str, &search_unhashed)) { + gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:" + " lookup-unhashed reconfigured (%s)", + temp_str); + conf->search_unhashed = search_unhashed; + } else { + gf_log(this->name, GF_LOG_ERROR, "Reconfigure:" + " lookup-unhashed should be boolean," + " not (%s), defaulting to (%d)", + temp_str, conf->search_unhashed); + ret = -1; + goto out; + } + } else { + gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:" + " lookup-unhashed reconfigured auto "); + conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; + } + } + + GF_OPTION_RECONF ("min-free-disk", conf->min_free_disk, options, + percent_or_size, out); + /* option can be any one of percent or bytes */ + conf->disk_unit = 0; + if (conf->min_free_disk < 100.0) + conf->disk_unit = 'p'; + + GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options, + percent, out); + + GF_OPTION_RECONF ("directory-layout-spread", conf->dir_spread_cnt, + options, uint32, out); + + GF_OPTION_RECONF ("readdir-optimize", conf->readdir_optimize, options, + bool, out); + if (conf->defrag) { + GF_OPTION_RECONF ("rebalance-stats", conf->defrag->stats, + options, bool, out); + } + + if (dict_get_str (options, "decommissioned-bricks", &temp_str) == 0) { + ret = dht_parse_decommissioned_bricks (this, conf, temp_str); + if (ret == -1) + goto out; + } else { + ret = dht_decommissioned_remove (this, conf); + if (ret == -1) + goto out; + } + + dht_init_regex (this, options, "rsync-hash-regex", + &conf->rsync_regex, &conf->rsync_regex_valid); + dht_init_regex (this, options, "extra-hash-regex", + &conf->extra_regex, &conf->extra_regex_valid); + + ret = 0; +out: + return ret; +} + +static int +gf_defrag_pattern_list_fill (xlator_t *this, gf_defrag_info_t *defrag, char *data) +{ + int ret = -1; + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *dup_str = NULL; + char *num = NULL; + char *pattern_str = NULL; + char *pattern = NULL; + gf_defrag_pattern_list_t *temp_list = NULL; + gf_defrag_pattern_list_t *pattern_list = NULL; + + if (!this || !defrag || !data) + goto out; + + /* Get the pattern for pattern list. "pattern:<optional-size>" + * eg: *avi, *pdf:10MB, *:1TB + */ + pattern_str = strtok_r (data, ",", &tmp_str); + while (pattern_str) { + dup_str = gf_strdup (pattern_str); + pattern_list = GF_CALLOC (1, sizeof (gf_defrag_pattern_list_t), + 1); + if (!pattern_list) { + goto out; + } + pattern = strtok_r (dup_str, ":", &tmp_str1); + num = strtok_r (NULL, ":", &tmp_str1); + if (!pattern) + goto out; + if (!num) { + if (gf_string2bytesize_uint64(pattern, &pattern_list->size) + == 0) { + pattern = "*"; + } + } else if (gf_string2bytesize_uint64 (num, &pattern_list->size) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "invalid number format \"%s\"", num); + goto out; + } + memcpy (pattern_list->path_pattern, pattern, strlen (dup_str)); + + if (!defrag->defrag_pattern) + temp_list = NULL; + else + temp_list = defrag->defrag_pattern; + + pattern_list->next = temp_list; + + defrag->defrag_pattern = pattern_list; + pattern_list = NULL; + + GF_FREE (dup_str); + dup_str = NULL; + + pattern_str = strtok_r (NULL, ",", &tmp_str); + } + + ret = 0; +out: + if (ret) + GF_FREE (pattern_list); + GF_FREE (dup_str); + + return ret; +} + +int +dht_init (xlator_t *this) +{ + dht_conf_t *conf = NULL; + char *temp_str = NULL; + int ret = -1; + int i = 0; + gf_defrag_info_t *defrag = NULL; + int cmd = 0; + char *node_uuid = NULL; + + + GF_VALIDATE_OR_GOTO ("dht", this, err); + + if (!this->children) { + gf_log (this->name, GF_LOG_CRITICAL, + "Distribute needs more than one subvolume"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. check volfile"); + } + + conf = GF_CALLOC (1, sizeof (*conf), gf_dht_mt_dht_conf_t); + if (!conf) { + goto err; + } + + ret = dict_get_int32 (this->options, "rebalance-cmd", &cmd); + + if (cmd) { + defrag = GF_CALLOC (1, sizeof (gf_defrag_info_t), + gf_defrag_info_mt); + + GF_VALIDATE_OR_GOTO (this->name, defrag, err); + + LOCK_INIT (&defrag->lock); + + defrag->is_exiting = 0; + + conf->defrag = defrag; + + ret = dict_get_str (this->options, "node-uuid", &node_uuid); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "node-uuid not " + "specified"); + goto err; + } + + if (uuid_parse (node_uuid, defrag->node_uuid)) { + gf_log (this->name, GF_LOG_ERROR, "Cannot parse " + "glusterd node uuid"); + goto err; + } + + defrag->cmd = cmd; + + defrag->stats = _gf_false; + } + + conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON; + if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) { + /* If option is not "auto", other options _should_ be boolean */ + if (strcasecmp (temp_str, "auto")) + gf_string2boolean (temp_str, &conf->search_unhashed); + else + conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; + } + + GF_OPTION_INIT ("unhashed-sticky-bit", conf->unhashed_sticky_bit, bool, + err); + + GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err); + + GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size, + err); + + GF_OPTION_INIT ("min-free-inodes", conf->min_free_inodes, percent, + err); + + conf->dir_spread_cnt = conf->subvolume_cnt; + GF_OPTION_INIT ("directory-layout-spread", conf->dir_spread_cnt, + uint32, err); + + GF_OPTION_INIT ("assert-no-child-down", conf->assert_no_child_down, + bool, err); + + GF_OPTION_INIT ("readdir-optimize", conf->readdir_optimize, bool, err); + + if (defrag) { + GF_OPTION_INIT ("rebalance-stats", defrag->stats, bool, err); + if (dict_get_str (this->options, "rebalance-filter", &temp_str) + == 0) { + if (gf_defrag_pattern_list_fill (this, defrag, temp_str) + == -1) { + gf_log (this->name, GF_LOG_ERROR, "Cannot parse" + " rebalance-filter (%s)", temp_str); + goto err; + } + } + } + + /* option can be any one of percent or bytes */ + conf->disk_unit = 0; + if (conf->min_free_disk < 100) + conf->disk_unit = 'p'; + + ret = dht_init_subvolumes (this, conf); + if (ret == -1) { + goto err; + } + + if (dict_get_str (this->options, "decommissioned-bricks", &temp_str) == 0) { + ret = dht_parse_decommissioned_bricks (this, conf, temp_str); + if (ret == -1) + goto err; + } + + dht_init_regex (this, this->options, "rsync-hash-regex", + &conf->rsync_regex, &conf->rsync_regex_valid); + dht_init_regex (this, this->options, "extra-hash-regex", + &conf->extra_regex, &conf->extra_regex_valid); + + ret = dht_layouts_init (this, conf); + if (ret == -1) { + goto err; + } + + LOCK_INIT (&conf->subvolume_lock); + LOCK_INIT (&conf->layout_lock); + + conf->gen = 1; + + this->local_pool = mem_pool_new (dht_local_t, 512); + if (!this->local_pool) { + gf_log (this->name, GF_LOG_ERROR, + "failed to create local_t's memory pool"); + goto err; + } + + GF_OPTION_INIT ("xattr-name", conf->xattr_name, str, err); + gf_asprintf (&conf->link_xattr_name, "%s."DHT_LINKFILE_STR, + conf->xattr_name); + gf_asprintf (&conf->wild_xattr_name, "%s*", conf->xattr_name); + if (!conf->link_xattr_name || !conf->wild_xattr_name) { + goto err; + } + + this->private = conf; + + return 0; + +err: + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + GF_FREE (conf->file_layouts[i]); + } + GF_FREE (conf->file_layouts); + } + + GF_FREE (conf->subvolumes); + + GF_FREE (conf->subvolume_status); + + GF_FREE (conf->du_stats); + + GF_FREE (conf->defrag); + + GF_FREE (conf->xattr_name); + GF_FREE (conf->link_xattr_name); + GF_FREE (conf->wild_xattr_name); + + GF_FREE (conf); + } + + return -1; +} + + +struct volume_options options[] = { + { .key = {"lookup-unhashed"}, + .value = {"auto", "yes", "no", "enable", "disable", "1", "0", + "on", "off"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "on", + .description = "This option if set to ON, does a lookup through " + "all the sub-volumes, in case a lookup didn't return any result " + "from the hash subvolume. If set to OFF, it does not do a lookup " + "on the remaining subvolumes." + }, + { .key = {"min-free-disk"}, + .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, + .default_value = "10%", + .description = "Percentage/Size of disk space, after which the " + "process starts balancing out the cluster, and logs will appear " + "in log files", + }, + { .key = {"min-free-inodes"}, + .type = GF_OPTION_TYPE_PERCENT, + .default_value = "5%", + .description = "after system has only N% of inodes, warnings " + "starts to appear in log files", + }, + { .key = {"unhashed-sticky-bit"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + }, + { .key = {"use-readdirp"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "This option if set to ON, forces the use of " + "readdirp, and hence also displays the stats of the files." + }, + { .key = {"assert-no-child-down"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option if set to ON, in the event of " + "CHILD_DOWN, will call exit." + }, + { .key = {"directory-layout-spread"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .validate = GF_OPT_VALIDATE_MIN, + .description = "Specifies the directory layout spread. Takes number " + "of subvolumes as default value." + }, + { .key = {"decommissioned-bricks"}, + .type = GF_OPTION_TYPE_ANY, + .description = "This option if set to ON, decommissions " + "the brick, so that no new data is allowed to be created " + "on that brick." + }, + { .key = {"rebalance-cmd"}, + .type = GF_OPTION_TYPE_INT, + }, + { .key = {"node-uuid"}, + .type = GF_OPTION_TYPE_STR, + }, + { .key = {"rebalance-stats"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option if set to ON displays and logs the " + " time taken for migration of each file, during the rebalance " + "process. If set to OFF, the rebalance logs will only display the " + "time spent in each directory." + }, + { .key = {"readdir-optimize"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option if set to ON enables the optimization " + "that allows DHT to requests non-first subvolumes to filter out " + "directory entries." + }, + { .key = {"rsync-hash-regex"}, + .type = GF_OPTION_TYPE_STR, + /* Setting a default here doesn't work. See dht_init_regex. */ + .description = "Regular expression for stripping temporary-file " + "suffix and prefix used by rsync, to prevent relocation when the " + "file is renamed." + }, + { .key = {"extra-hash-regex"}, + .type = GF_OPTION_TYPE_STR, + /* Setting a default here doesn't work. See dht_init_regex. */ + .description = "Regular expression for stripping temporary-file " + "suffix and prefix used by an application, to prevent relocation when " + "the file is renamed." + }, + { .key = {"rebalance-filter"}, + .type = GF_OPTION_TYPE_STR, + }, + + { .key = {"xattr-name"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "trusted.glusterfs.dht", + .description = "Base for extended attributes used by this " + "translator instance, to avoid conflicts with others above or " + "below it." + }, + + /* NUFA option */ + { .key = {"local-volume-name"}, + .type = GF_OPTION_TYPE_XLATOR + }, + + /* switch option */ + { .key = {"pattern.switch.case"}, + .type = GF_OPTION_TYPE_ANY + }, + + { .key = {NULL} }, +}; diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c index e004dee0d..fc0ca2f77 100644 --- a/xlators/cluster/dht/src/dht.c +++ b/xlators/cluster/dht/src/dht.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -23,238 +14,76 @@ #include "config.h" #endif -/* TODO: add NS locking */ - -#include "dht-common.c" - -/* TODO: - - use volumename in xattr instead of "dht" - - use NS locks - - handle all cases in self heal layout reconstruction - - complete linkfile selfheal -*/ - - - -int -notify (xlator_t *this, int event, void *data, ...) -{ - int ret = -1; - - ret = dht_notify (this, event, data); - - return ret; -} - -void -fini (xlator_t *this) -{ - int i = 0; - dht_conf_t *conf = NULL; - - conf = this->private; - - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - FREE (conf->file_layouts[i]); - } - FREE (conf->file_layouts); - } - - if (conf->default_dir_layout) - FREE (conf->default_dir_layout); - - if (conf->subvolumes) - FREE (conf->subvolumes); - - if (conf->subvolume_status) - FREE (conf->subvolume_status); - - FREE (conf); - } - - return; -} - -int -init (xlator_t *this) -{ - dht_conf_t *conf = NULL; - char *temp_str = NULL; - int ret = -1; - int i = 0; - uint32_t temp_free_disk = 0; - - if (!this->children) { - gf_log (this->name, GF_LOG_CRITICAL, - "Distribute needs more than one subvolume"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile"); - } - - conf = CALLOC (1, sizeof (*conf)); - if (!conf) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } - - conf->search_unhashed = 0; - - if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) { - gf_string2boolean (temp_str, &conf->search_unhashed); - } - - conf->unhashed_sticky_bit = 0; - - if (dict_get_str (this->options, "unhashed-sticky-bit", - &temp_str) == 0) { - gf_string2boolean (temp_str, &conf->unhashed_sticky_bit); - } - - conf->disk_unit = 'p'; - conf->min_free_disk = 10; - - if (dict_get_str (this->options, "min-free-disk", &temp_str) == 0) { - if (gf_string2percent (temp_str, &temp_free_disk) == 0) { - if (temp_free_disk > 100) { - gf_string2bytesize (temp_str, - &conf->min_free_disk); - conf->disk_unit = 'b'; - } else { - conf->min_free_disk = (uint64_t)temp_free_disk; - } - } else { - gf_string2bytesize (temp_str, &conf->min_free_disk); - conf->disk_unit = 'b'; - } - } - - - ret = dht_init_subvolumes (this, conf); - if (ret == -1) { - goto err; - } - - ret = dht_layouts_init (this, conf); - if (ret == -1) { - goto err; - } - - conf->du_stats = CALLOC (conf->subvolume_cnt, sizeof (dht_du_t)); - if (!conf->du_stats) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } - - LOCK_INIT (&conf->subvolume_lock); - - conf->gen = 1; - - this->private = conf; - - return 0; - -err: - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - FREE (conf->file_layouts[i]); - } - FREE (conf->file_layouts); - } - - if (conf->default_dir_layout) - FREE (conf->default_dir_layout); - - if (conf->subvolumes) - FREE (conf->subvolumes); - - if (conf->subvolume_status) - FREE (conf->subvolume_status); - - if (conf->du_stats) - FREE (conf->du_stats); - - FREE (conf); - } - - return -1; -} +#include "statedump.h" +#include "dht-common.h" +class_methods_t class_methods = { + .init = dht_init, + .fini = dht_fini, + .reconfigure = dht_reconfigure, + .notify = dht_notify +}; struct xlator_fops fops = { - .lookup = dht_lookup, - .mknod = dht_mknod, - .create = dht_create, - - .stat = dht_stat, - .chmod = dht_chmod, - .chown = dht_chown, - .fchown = dht_fchown, - .fchmod = dht_fchmod, - .fstat = dht_fstat, - .utimens = dht_utimens, - .truncate = dht_truncate, - .ftruncate = dht_ftruncate, - .access = dht_access, - .readlink = dht_readlink, - .setxattr = dht_setxattr, - .getxattr = dht_getxattr, - .removexattr = dht_removexattr, - .open = dht_open, - .readv = dht_readv, - .writev = dht_writev, - .flush = dht_flush, - .fsync = dht_fsync, - .statfs = dht_statfs, - .lk = dht_lk, - .opendir = dht_opendir, - .readdir = dht_readdir, - .fsyncdir = dht_fsyncdir, - .symlink = dht_symlink, - .unlink = dht_unlink, - .link = dht_link, - .mkdir = dht_mkdir, - .rmdir = dht_rmdir, - .rename = dht_rename, - .inodelk = dht_inodelk, - .finodelk = dht_finodelk, - .entrylk = dht_entrylk, - .fentrylk = dht_fentrylk, - .xattrop = dht_xattrop, - .fxattrop = dht_fxattrop, -#if 0 - .setdents = dht_setdents, - .getdents = dht_getdents, - .checksum = dht_checksum, -#endif + .lookup = dht_lookup, + .mknod = dht_mknod, + .create = dht_create, + + .open = dht_open, + .statfs = dht_statfs, + .opendir = dht_opendir, + .readdir = dht_readdir, + .readdirp = dht_readdirp, + .fsyncdir = dht_fsyncdir, + .symlink = dht_symlink, + .unlink = dht_unlink, + .link = dht_link, + .mkdir = dht_mkdir, + .rmdir = dht_rmdir, + .rename = dht_rename, + .entrylk = dht_entrylk, + .fentrylk = dht_fentrylk, + + /* Inode read operations */ + .stat = dht_stat, + .fstat = dht_fstat, + .access = dht_access, + .readlink = dht_readlink, + .getxattr = dht_getxattr, + .fgetxattr = dht_fgetxattr, + .readv = dht_readv, + .flush = dht_flush, + .fsync = dht_fsync, + .inodelk = dht_inodelk, + .finodelk = dht_finodelk, + .lk = dht_lk, + + /* Inode write operations */ + .fremovexattr = dht_fremovexattr, + .removexattr = dht_removexattr, + .setxattr = dht_setxattr, + .fsetxattr = dht_fsetxattr, + .truncate = dht_truncate, + .ftruncate = dht_ftruncate, + .writev = dht_writev, + .xattrop = dht_xattrop, + .fxattrop = dht_fxattrop, + .setattr = dht_setattr, + .fsetattr = dht_fsetattr, + .fallocate = dht_fallocate, + .discard = dht_discard, + .zerofill = dht_zerofill, }; - -struct xlator_mops mops = { +struct xlator_dumpops dumpops = { + .priv = dht_priv_dump, + .inodectx = dht_inodectx_dump, }; struct xlator_cbks cbks = { -// .release = dht_release, +// .release = dht_release, // .releasedir = dht_releasedir, - .forget = dht_forget -}; - - -struct volume_options options[] = { - { .key = {"lookup-unhashed"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {"min-free-disk"}, - .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, - }, - { .key = {NULL} }, + .forget = dht_forget }; +; diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c index c8ce7fdee..e934acdf0 100644 --- a/xlators/cluster/dht/src/nufa.c +++ b/xlators/cluster/dht/src/nufa.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -23,16 +14,18 @@ #include "config.h" #endif -#include "dht-common.c" +#include "dht-common.h" /* TODO: all 'TODO's in dht.c holds good */ -int +extern struct volume_options options[]; + +int nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct stat *stbuf, dict_t *xattr) + int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) { - dht_layout_t *layout = NULL; xlator_t *subvol = NULL; char is_linkfile = 0; char is_dir = 0; @@ -41,8 +34,8 @@ nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, loc_t *loc = NULL; int i = 0; call_frame_t *prev = NULL; - int call_cnt = 0; - + int call_cnt = 0; + int ret = 0; conf = this->private; @@ -50,58 +43,52 @@ nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local = frame->local; loc = &local->loc; - if (ENTRY_MISSING (op_ret, op_errno)) { - if (conf->search_unhashed) { - local->op_errno = ENOENT; - dht_lookup_everywhere (frame, this, loc); - return 0; - } - } + if (ENTRY_MISSING (op_ret, op_errno)) { + if (conf->search_unhashed) { + local->op_errno = ENOENT; + dht_lookup_everywhere (frame, this, loc); + return 0; + } + } if (op_ret == -1) goto out; - is_linkfile = check_is_linkfile (inode, stbuf, xattr); + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); is_dir = check_is_dir (inode, stbuf, xattr); if (!is_dir && !is_linkfile) { /* non-directory and not a linkfile */ + ret = dht_layout_preset (this, prev->this, inode); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "could not set pre-set layout for subvol %s", + prev->this->name); + op_ret = -1; + op_errno = EINVAL; + goto err; + } - dht_itransform (this, prev->this, stbuf->st_ino, - &stbuf->st_ino); - - layout = dht_layout_for_subvol (this, prev->this); - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no pre-set layout for subvolume %s", - prev->this->name); - op_ret = -1; - op_errno = EINVAL; - goto err; - } - - inode_ctx_put (inode, this, (uint64_t)(long)layout); goto out; } if (is_dir) { call_cnt = conf->subvolume_cnt; - local->call_cnt = call_cnt; + local->call_cnt = call_cnt; local->inode = inode_ref (inode); local->xattr = dict_ref (xattr); - local->op_ret = 0; - local->op_errno = 0; + local->op_ret = 0; + local->op_errno = 0; - local->layout = dht_layout_new (this, conf->subvolume_cnt); - if (!local->layout) { - op_ret = -1; - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_DEBUG, - "memory allocation failed :("); - goto err; - } + local->layout = dht_layout_new (this, conf->subvolume_cnt); + if (!local->layout) { + op_ret = -1; + op_errno = ENOMEM; + goto err; + } for (i = 0; i < call_cnt; i++) { STACK_WIND (frame, dht_lookup_dir_cbk, @@ -118,51 +105,52 @@ nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (this->name, GF_LOG_DEBUG, "linkfile not having link subvolume. path=%s", loc->path); - dht_lookup_everywhere (frame, this, loc); - return 0; + dht_lookup_everywhere (frame, this, loc); + return 0; } - STACK_WIND (frame, dht_lookup_linkfile_cbk, - subvol, subvol->fops->lookup, - &local->loc, local->xattr_req); + STACK_WIND (frame, dht_lookup_linkfile_cbk, + subvol, subvol->fops->lookup, + &local->loc, local->xattr_req); } return 0; out: - if (!local->hashed_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - local->loc.path); - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, dht_lookup_cbk, - local->hashed_subvol, local->hashed_subvol->fops->lookup, - &local->loc, local->xattr_req); - - return 0; - - err: - DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf, xattr); + if (!local->hashed_subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no subvolume in layout for path=%s", + local->loc.path); + local->op_errno = ENOENT; + dht_lookup_everywhere (frame, this, loc); + return 0; + } + + STACK_WIND (frame, dht_lookup_cbk, + local->hashed_subvol, local->hashed_subvol->fops->lookup, + &local->loc, local->xattr_req); + + return 0; + +err: + DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, + inode, stbuf, xattr, postparent); return 0; } int nufa_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) + loc_t *loc, dict_t *xattr_req) { xlator_t *hashed_subvol = NULL; - xlator_t *cached_subvol = NULL; xlator_t *subvol = NULL; dht_local_t *local = NULL; - dht_conf_t *conf = NULL; + dht_conf_t *conf = NULL; int ret = -1; int op_errno = -1; - dht_layout_t *layout = NULL; - int i = 0; - int call_cnt = 0; + dht_layout_t *layout = NULL; + int i = 0; + int call_cnt = 0; VALIDATE_OR_GOTO (frame, err); @@ -171,40 +159,26 @@ nufa_lookup (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (loc->inode, err); VALIDATE_OR_GOTO (loc->path, err); - conf = this->private; - - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } + conf = this->private; - ret = loc_dup (loc, &local->loc); - if (ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "copying location failed for path=%s", - loc->path); + local = dht_local_init (frame, loc, NULL, GF_FOP_LOOKUP); + if (!local) { + op_errno = ENOMEM; goto err; } - if (xattr_req) { - local->xattr_req = dict_ref (xattr_req); - } else { - local->xattr_req = dict_new (); - } + if (xattr_req) { + local->xattr_req = dict_ref (xattr_req); + } else { + local->xattr_req = dict_new (); + } - hashed_subvol = dht_subvol_get_hashed (this, &local->loc); - cached_subvol = dht_subvol_get_cached (this, local->loc.inode); - - local->cached_subvol = cached_subvol; - local->hashed_subvol = hashed_subvol; + hashed_subvol = dht_subvol_get_hashed (this, &local->loc); - if (is_revalidate (loc)) { - layout = dht_layout_get (this, loc->inode); + local->hashed_subvol = hashed_subvol; + if (is_revalidate (loc)) { + layout = local->layout; if (!layout) { gf_log (this->name, GF_LOG_DEBUG, "revalidate without cache. path=%s", @@ -213,530 +187,490 @@ nufa_lookup (call_frame_t *frame, xlator_t *this, goto err; } - if (layout->gen && (layout->gen < conf->gen)) { - gf_log (this->name, GF_LOG_DEBUG, - "incomplete layout failure for path=%s", - loc->path); - op_errno = ESTALE; - goto err; - } - - local->inode = inode_ref (loc->inode); - local->st_ino = loc->inode->ino; - - local->call_cnt = layout->cnt; - call_cnt = local->call_cnt; - - /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute, - * revalidates directly go to the cached-subvolume. - */ - ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); - - for (i = 0; i < layout->cnt; i++) { - subvol = layout->list[i].xlator; - - STACK_WIND (frame, dht_revalidate_cbk, - subvol, subvol->fops->lookup, - loc, local->xattr_req); - - if (!--call_cnt) - break; - } - } else { - ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); - - ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht.linkto", 256); - - /* Send it to only local volume */ - STACK_WIND (frame, nufa_local_lookup_cbk, - conf->local_volume, - conf->local_volume->fops->lookup, - loc, local->xattr_req); - } + if (layout->gen && (layout->gen < conf->gen)) { + gf_log (this->name, GF_LOG_DEBUG, + "incomplete layout failure for path=%s", + loc->path); + dht_layout_unref (this, local->layout); + goto do_fresh_lookup; + } + + local->inode = inode_ref (loc->inode); + + local->call_cnt = layout->cnt; + call_cnt = local->call_cnt; + + /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute, + * revalidates directly go to the cached-subvolume. + */ + ret = dict_set_uint32 (local->xattr_req, + conf->xattr_name, 4 * 4); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set dict value."); + op_errno = -1; + goto err; + } + + for (i = 0; i < layout->cnt; i++) { + subvol = layout->list[i].xlator; + + STACK_WIND (frame, dht_revalidate_cbk, + subvol, subvol->fops->lookup, + loc, local->xattr_req); + + if (!--call_cnt) + break; + } + } else { + do_fresh_lookup: + ret = dict_set_uint32 (local->xattr_req, + conf->xattr_name, 4 * 4); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set dict value."); + op_errno = -1; + goto err; + } + + ret = dict_set_uint32 (local->xattr_req, + conf->link_xattr_name, 256); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set dict value."); + op_errno = -1; + goto err; + } + + /* Send it to only local volume */ + STACK_WIND (frame, nufa_local_lookup_cbk, + (xlator_t *)conf->private, + ((xlator_t *)conf->private)->fops->lookup, + loc, local->xattr_req); + } return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL); - return 0; + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, + NULL); + return 0; } int -nufa_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int op_ret, int op_errno, - inode_t *inode, struct stat *stbuf) +nufa_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - dht_conf_t *conf = NULL; - - local = frame->local; - prev = cookie; - conf = this->private; - - if (op_ret == -1) - goto err; - - STACK_WIND (frame, dht_create_cbk, - local->cached_subvol, local->cached_subvol->fops->create, - &local->loc, local->flags, local->mode, local->fd); - - return 0; - - err: - DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL); - return 0; + dht_local_t *local = NULL; + + local = frame->local; + + if (op_ret == -1) + goto err; + + STACK_WIND (frame, dht_create_cbk, + local->cached_subvol, local->cached_subvol->fops->create, + &local->loc, local->flags, local->mode, local->umask, + local->fd, local->params); + + return 0; + +err: + DHT_STACK_UNWIND (create, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL, NULL); + return 0; } int nufa_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, fd_t *fd) + loc_t *loc, int32_t flags, mode_t mode, + mode_t umask, fd_t *fd, dict_t *params) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - xlator_t *subvol = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + xlator_t *subvol = NULL; xlator_t *avail_subvol = NULL; - int op_errno = -1; - int ret = -1; + int op_errno = -1; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); - conf = this->private; + conf = this->private; dht_get_du_info (frame, this, loc); - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } - - subvol = dht_subvol_get_hashed (this, loc); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - loc->path); - op_errno = ENOENT; - goto err; - } - - avail_subvol = conf->local_volume; - if (dht_is_subvol_filled (this, conf->local_volume)) { - avail_subvol = - dht_free_disk_available_subvol (this, - conf->local_volume); + local = dht_local_init (frame, loc, fd, GF_FOP_CREATE); + if (!local) { + op_errno = ENOMEM; + goto err; } - + + subvol = dht_subvol_get_hashed (this, loc); + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } + + avail_subvol = conf->private; + if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) { + avail_subvol = + dht_free_disk_available_subvol (this, + (xlator_t *)conf->private, + local); + } + if (subvol != avail_subvol) { /* create a link file instead of actual file */ - ret = loc_copy (&local->loc, loc); - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - op_errno = ENOMEM; - goto err; - } - - local->fd = fd_ref (fd); + local->params = dict_ref (params); local->mode = mode; local->flags = flags; - + local->umask = umask; local->cached_subvol = avail_subvol; - dht_linkfile_create (frame, - nufa_create_linkfile_create_cbk, - avail_subvol, subvol, loc); + dht_linkfile_create (frame, nufa_create_linkfile_create_cbk, + this, avail_subvol, subvol, loc); return 0; } gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); - + STACK_WIND (frame, dht_create_cbk, subvol, subvol->fops->create, - loc, flags, mode, fd); - + loc, flags, mode, umask, fd, params); + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (create, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } int nufa_mknod_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct stat *stbuf) + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - dht_conf_t *conf = NULL; - - local = frame->local; - prev = cookie; - conf = this->private; - - if (op_ret >= 0) { - STACK_WIND (frame, dht_newfile_cbk, - local->cached_subvol, - local->cached_subvol->fops->mknod, - &local->loc, local->mode, local->rdev); - - return 0; - } - - DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf); - return 0; + dht_local_t *local = NULL; + + local = frame->local; + if (!local || !local->cached_subvol) { + op_errno = EINVAL; + op_ret = -1; + goto err; + } + + if (op_ret >= 0) { + STACK_WIND_COOKIE (frame, dht_newfile_cbk, + (void *)local->cached_subvol, local->cached_subvol, + local->cached_subvol->fops->mknod, + &local->loc, local->mode, local->rdev, + local->umask, local->params); + + return 0; + } +err: + WIPE (postparent); + WIPE (preparent); + + DHT_STACK_UNWIND (link, frame, op_ret, op_errno, + inode, stbuf, preparent, postparent, xdata); + return 0; } int nufa_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t rdev) + loc_t *loc, mode_t mode, dev_t rdev, mode_t umask, dict_t *params) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - xlator_t *subvol = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + xlator_t *subvol = NULL; xlator_t *avail_subvol = NULL; - int op_errno = -1; - int ret = -1; + int op_errno = -1; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); - conf = this->private; + conf = this->private; dht_get_du_info (frame, this, loc); - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } - - subvol = dht_subvol_get_hashed (this, loc); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - loc->path); - op_errno = ENOENT; - goto err; - } + local = dht_local_init (frame, loc, NULL, GF_FOP_MKNOD); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = dht_subvol_get_hashed (this, loc); + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } /* Consider the disksize in consideration */ - avail_subvol = conf->local_volume; - if (dht_is_subvol_filled (this, conf->local_volume)) { - avail_subvol = - dht_free_disk_available_subvol (this, - conf->local_volume); + avail_subvol = conf->private; + if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) { + avail_subvol = + dht_free_disk_available_subvol (this, + (xlator_t *)conf->private, + local); } - if (avail_subvol != subvol) { - /* Create linkfile first */ - ret = loc_copy (&local->loc, loc); - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - op_errno = ENOMEM; - goto err; - } - - local->mode = mode; - local->rdev = rdev; - local->cached_subvol = avail_subvol; - - dht_linkfile_create (frame, nufa_mknod_linkfile_cbk, + if (avail_subvol != subvol) { + /* Create linkfile first */ + + local->params = dict_ref (params); + local->mode = mode; + local->umask = umask; + local->rdev = rdev; + local->cached_subvol = avail_subvol; + + dht_linkfile_create (frame, nufa_mknod_linkfile_cbk, this, avail_subvol, subvol, loc); - return 0; - } + return 0; + } - gf_log (this->name, GF_LOG_TRACE, - "creating %s on %s", loc->path, subvol->name); + gf_log (this->name, GF_LOG_TRACE, + "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->mknod, - loc, mode, rdev); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, subvol, + subvol->fops->mknod, loc, mode, rdev, umask, + params); - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (mknod, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } -int -notify (xlator_t *this, int event, void *data, ...) -{ - int ret = -1; - - ret = dht_notify (this, event, data); - - return ret; -} - -void -fini (xlator_t *this) +gf_boolean_t +same_first_part (char *str1, char term1, char *str2, char term2) { - int i = 0; - dht_conf_t *conf = NULL; - - conf = this->private; - - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - FREE (conf->file_layouts[i]); - } - FREE (conf->file_layouts); + gf_boolean_t ended1; + gf_boolean_t ended2; + + for (;;) { + ended1 = ((*str1 == '\0') || (*str1 == term1)); + ended2 = ((*str2 == '\0') || (*str2 == term2)); + if (ended1 && ended2) { + return _gf_true; } + if (ended1 || ended2 || (*str1 != *str2)) { + return _gf_false; + } + ++str1; + ++str2; + } +} - if (conf->default_dir_layout) - FREE (conf->default_dir_layout); - - if (conf->subvolumes) - FREE (conf->subvolumes); +typedef struct nufa_args { + xlator_t *this; + char *volname; + gf_boolean_t addr_match; +} nufa_args_t; - if (conf->subvolume_status) - FREE (conf->subvolume_status); +static void +nufa_find_local_brick (xlator_t *xl, void *data) +{ + nufa_args_t *args = data; + xlator_t *this = args->this; + char *local_volname = args->volname; + gf_boolean_t addr_match = args->addr_match; + char *brick_host = NULL; + dht_conf_t *conf = this->private; + int ret = -1; + + /*This means a local subvol was already found. We pick the first brick + * that is local*/ + if (conf->private) + return; + + if (strcmp (xl->name, local_volname) == 0) { + conf->private = xl; + gf_log (this->name, GF_LOG_INFO, "Using specified subvol %s", + local_volname); + return; + } - FREE (conf); + if (!addr_match) + return; + + ret = dict_get_str (xl->options, "remote-host", &brick_host); + if ((ret == 0) && + (gf_is_same_address (local_volname, brick_host) || + gf_is_local_addr (brick_host))) { + conf->private = xl; + gf_log (this->name, GF_LOG_INFO, "Using the first local " + "subvol %s", xl->name); + return; } - return; } -int -init (xlator_t *this) +static void +nufa_to_dht (xlator_t *this) { - dht_conf_t *conf = NULL; - xlator_list_t *trav = NULL; - data_t *data = NULL; - char *local_volname = NULL; - char *temp_str = NULL; - int ret = -1; - int i = 0; - char my_hostname[256]; - uint32_t temp_free_disk = 0; - - if (!this->children) { - gf_log (this->name, GF_LOG_CRITICAL, - "NUFA needs more than one subvolume"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile"); - } - - conf = CALLOC (1, sizeof (*conf)); - if (!conf) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; - } - - conf->search_unhashed = 0; + GF_ASSERT (this); + GF_ASSERT (this->fops); - if (dict_get_str (this->options, "lookup-unhashed", - &temp_str) == 0) { - gf_string2boolean (temp_str, - &conf->search_unhashed); - } - - ret = dht_init_subvolumes (this, conf); - if (ret == -1) { - goto err; - } + this->fops->lookup = dht_lookup; + this->fops->create = dht_create; + this->fops->mknod = dht_mknod; +} - ret = dht_layouts_init (this, conf); - if (ret == -1) { - goto err; +int +nufa_find_local_subvol (xlator_t *this, + void (*fn) (xlator_t *each, void* data), void *data) +{ + int ret = -1; + dht_conf_t *conf = this->private; + xlator_list_t *trav = NULL; + xlator_t *parent = NULL; + xlator_t *candidate = NULL; + + xlator_foreach_depth_first (this, fn, data); + if (!conf->private) { + gf_log (this->name, GF_LOG_ERROR, "Couldn't find a local " + "brick"); + return -1; } - LOCK_INIT (&conf->subvolume_lock); - - conf->gen = 1; - - local_volname = "localhost"; - ret = gethostname (my_hostname, 256); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "could not find hostname (%s)", - strerror (errno)); - } - - if (ret == 0) - local_volname = my_hostname; - - data = dict_get (this->options, "local-volume-name"); - if (data) { - local_volname = data->data; - } - - trav = this->children; - while (trav) { - if (strcmp (trav->xlator->name, local_volname) == 0) - break; - trav = trav->next; - } - - if (!trav) { - gf_log (this->name, GF_LOG_ERROR, - "Could not find subvolume named '%s'. " - "Please define volume with the name as the hostname " - "or override it with 'option local-volume-name'", - local_volname); - goto err; - } - /* The volume specified exists */ - conf->local_volume = trav->xlator; - - conf->min_free_disk = 10; - conf->disk_unit = 'p'; - - if (dict_get_str (this->options, "min-free-disk", - &temp_str) == 0) { - if (gf_string2percent (temp_str, - &temp_free_disk) == 0) { - if (temp_free_disk > 100) { - gf_string2bytesize (temp_str, - &conf->min_free_disk); - conf->disk_unit = 'b'; - } else { - conf->min_free_disk = (uint64_t)temp_free_disk; - conf->disk_unit = 'p'; - } - } else { - gf_string2bytesize (temp_str, - &conf->min_free_disk); - conf->disk_unit = 'b'; + candidate = conf->private; + trav = candidate->parents; + while (trav) { + + parent = trav->xlator; + if (strcmp (parent->type, "cluster/nufa") == 0) { + gf_log (this->name, GF_LOG_INFO, "Found local subvol, " + "%s", candidate->name); + ret = 0; + conf->private = candidate; + break; } - } - conf->du_stats = CALLOC (conf->subvolume_cnt, sizeof (dht_du_t)); - if (!conf->du_stats) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto err; + candidate = parent; + trav = parent->parents; } - this->private = conf; - - return 0; + return ret; +} -err: - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - FREE (conf->file_layouts[i]); - } - FREE (conf->file_layouts); - } +int +nufa_init (xlator_t *this) +{ + data_t *data = NULL; + char *local_volname = NULL; + int ret = -1; + char my_hostname[256]; + gf_boolean_t addr_match = _gf_false; + nufa_args_t args = {0, }; - if (conf->default_dir_layout) - FREE (conf->default_dir_layout); + ret = dht_init(this); + if (ret) { + return ret; + } - if (conf->subvolumes) - FREE (conf->subvolumes); + if ((data = dict_get (this->options, "local-volume-name"))) { + local_volname = data->data; - if (conf->subvolume_status) - FREE (conf->subvolume_status); + } else { + addr_match = _gf_true; + local_volname = "localhost"; + ret = gethostname (my_hostname, 256); + if (ret == 0) + local_volname = my_hostname; - if (conf->du_stats) - FREE (conf->du_stats); + else + gf_log (this->name, GF_LOG_WARNING, + "could not find hostname (%s)", + strerror (errno)); - FREE (conf); } - return -1; + args.this = this; + args.volname = local_volname; + args.addr_match = addr_match; + ret = nufa_find_local_subvol (this, nufa_find_local_brick, &args); + if (ret) { + gf_log (this->name, GF_LOG_INFO, + "Unable to find local subvolume, switching " + "to dht mode"); + nufa_to_dht (this); + } + return 0; } -struct xlator_fops fops = { - .lookup = nufa_lookup, - .create = nufa_create, - .mknod = nufa_mknod, - - .stat = dht_stat, - .chmod = dht_chmod, - .chown = dht_chown, - .fchown = dht_fchown, - .fchmod = dht_fchmod, - .fstat = dht_fstat, - .utimens = dht_utimens, - .truncate = dht_truncate, - .ftruncate = dht_ftruncate, - .access = dht_access, - .readlink = dht_readlink, - .setxattr = dht_setxattr, - .getxattr = dht_getxattr, - .removexattr = dht_removexattr, - .open = dht_open, - .readv = dht_readv, - .writev = dht_writev, - .flush = dht_flush, - .fsync = dht_fsync, - .statfs = dht_statfs, - .lk = dht_lk, - .opendir = dht_opendir, - .readdir = dht_readdir, - .fsyncdir = dht_fsyncdir, - .symlink = dht_symlink, - .unlink = dht_unlink, - .link = dht_link, - .mkdir = dht_mkdir, - .rmdir = dht_rmdir, - .rename = dht_rename, - .inodelk = dht_inodelk, - .finodelk = dht_finodelk, - .entrylk = dht_entrylk, - .fentrylk = dht_fentrylk, - .xattrop = dht_xattrop, - .fxattrop = dht_fxattrop, -#if 0 - .setdents = dht_setdents, - .getdents = dht_getdents, - .checksum = dht_checksum, -#endif +class_methods_t class_methods = { + .init = nufa_init, + .fini = dht_fini, + .reconfigure = dht_reconfigure, + .notify = dht_notify }; -struct xlator_mops mops = { +struct xlator_fops fops = { + .lookup = nufa_lookup, + .create = nufa_create, + .mknod = nufa_mknod, + + .stat = dht_stat, + .fstat = dht_fstat, + .truncate = dht_truncate, + .ftruncate = dht_ftruncate, + .access = dht_access, + .readlink = dht_readlink, + .setxattr = dht_setxattr, + .getxattr = dht_getxattr, + .removexattr = dht_removexattr, + .open = dht_open, + .readv = dht_readv, + .writev = dht_writev, + .flush = dht_flush, + .fsync = dht_fsync, + .statfs = dht_statfs, + .lk = dht_lk, + .opendir = dht_opendir, + .readdir = dht_readdir, + .readdirp = dht_readdirp, + .fsyncdir = dht_fsyncdir, + .symlink = dht_symlink, + .unlink = dht_unlink, + .link = dht_link, + .mkdir = dht_mkdir, + .rmdir = dht_rmdir, + .rename = dht_rename, + .inodelk = dht_inodelk, + .finodelk = dht_finodelk, + .entrylk = dht_entrylk, + .fentrylk = dht_fentrylk, + .xattrop = dht_xattrop, + .fxattrop = dht_fxattrop, + .setattr = dht_setattr, }; struct xlator_cbks cbks = { -// .release = dht_release, -// .releasedir = dht_releasedir, - .forget = dht_forget -}; - - -struct volume_options options[] = { - { .key = {"local-volume-name"}, - .type = GF_OPTION_TYPE_XLATOR - }, - { .key = {"lookup-unhashed"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {"min-free-disk"}, - .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, - }, - { .key = {NULL} }, + .forget = dht_forget }; diff --git a/xlators/cluster/dht/src/switch.c b/xlators/cluster/dht/src/switch.c new file mode 100644 index 000000000..2717ce975 --- /dev/null +++ b/xlators/cluster/dht/src/switch.c @@ -0,0 +1,904 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "dht-common.h" +#include "dht-mem-types.h" + +#include <sys/time.h> +#include <stdlib.h> +#include <fnmatch.h> +#include <string.h> + +extern struct volume_options options[]; + +struct switch_sched_array { + xlator_t *xl; + int32_t eligible; + int32_t considered; +}; + +/* Select one of this struct based on the path's pattern match */ +struct switch_struct { + struct switch_struct *next; + struct switch_sched_array *array; + int32_t node_index; /* Index of the node in + this pattern. */ + int32_t num_child; /* Total num of child nodes + with this pattern. */ + char path_pattern[256]; +}; + +/* TODO: all 'TODO's in dht.c holds good */ +/* This function should return child node as '*:subvolumes' is inserterd */ + +static int32_t +gf_switch_valid_child (xlator_t *this, const char *child) +{ + xlator_list_t *children = NULL; + int32_t ret = 0; + + children = this->children; + while (children) { + if (!strcmp (child, children->xlator->name)) { + ret = 1; + break; + } + children = children->next; + } + + return ret; +} + +static xlator_t * +get_switch_matching_subvol (const char *path, dht_conf_t *conf, + xlator_t *hashed_subvol) +{ + struct switch_struct *cond = NULL; + struct switch_struct *trav = NULL; + char *pathname = NULL; + int idx = 0; + xlator_t *subvol = NULL; + + cond = conf->private; + subvol = hashed_subvol; + if (!cond) + goto out; + + pathname = gf_strdup (path); + if (!pathname) + goto out; + + trav = cond; + while (trav) { + if (fnmatch (trav->path_pattern, + pathname, FNM_NOESCAPE) == 0) { + for (idx = 0; idx < trav->num_child; idx++) { + if (trav->array[idx].xl == hashed_subvol) + goto out; + } + idx = trav->node_index++; + trav->node_index %= trav->num_child; + subvol = trav->array[idx].xl; + goto out; + } + trav = trav->next; + } +out: + GF_FREE (pathname); + + return subvol; +} + + +int +switch_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) +{ + xlator_t *subvol = NULL; + char is_linkfile = 0; + char is_dir = 0; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + loc_t *loc = NULL; + int i = 0; + call_frame_t *prev = NULL; + int call_cnt = 0; + int ret = 0; + + conf = this->private; + + prev = cookie; + local = frame->local; + loc = &local->loc; + + if (ENTRY_MISSING (op_ret, op_errno)) { + if (conf->search_unhashed) { + local->op_errno = ENOENT; + dht_lookup_everywhere (frame, this, loc); + return 0; + } + } + + if (op_ret == -1) + goto out; + + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); + is_dir = check_is_dir (inode, stbuf, xattr); + + if (!is_dir && !is_linkfile) { + /* non-directory and not a linkfile */ + + ret = dht_layout_preset (this, prev->this, inode); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "could not set pre-set layout for subvol %s", + prev->this->name); + op_ret = -1; + op_errno = EINVAL; + goto err; + } + + goto out; + } + + if (is_dir) { + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + local->inode = inode_ref (inode); + local->xattr = dict_ref (xattr); + + local->op_ret = 0; + local->op_errno = 0; + + local->layout = dht_layout_new (this, conf->subvolume_cnt); + if (!local->layout) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_DEBUG, + "memory allocation failed :("); + goto err; + } + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_lookup_dir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, + &local->loc, local->xattr_req); + } + } + + if (is_linkfile) { + subvol = dht_linkfile_subvol (this, inode, stbuf, xattr); + + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "linkfile not having link subvolume. path=%s", + loc->path); + dht_lookup_everywhere (frame, this, loc); + return 0; + } + + STACK_WIND (frame, dht_lookup_linkfile_cbk, + subvol, subvol->fops->lookup, + &local->loc, local->xattr_req); + } + + return 0; + +out: + if (!local->hashed_subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no subvolume in layout for path=%s", + local->loc.path); + local->op_errno = ENOENT; + dht_lookup_everywhere (frame, this, loc); + return 0; + } + + STACK_WIND (frame, dht_lookup_cbk, + local->hashed_subvol, local->hashed_subvol->fops->lookup, + &local->loc, local->xattr_req); + + return 0; + +err: + DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, + inode, stbuf, xattr, NULL); + return 0; +} + +int +switch_lookup (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *xattr_req) +{ + xlator_t *hashed_subvol = NULL; + xlator_t *cached_subvol = NULL; + xlator_t *subvol = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + int op_errno = -1; + dht_layout_t *layout = NULL; + int i = 0; + int call_cnt = 0; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + conf = this->private; + + local = dht_local_init (frame, loc, NULL, GF_FOP_LOOKUP); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + if (xattr_req) { + local->xattr_req = dict_ref (xattr_req); + } else { + local->xattr_req = dict_new (); + } + + hashed_subvol = dht_subvol_get_hashed (this, &local->loc); + cached_subvol = local->cached_subvol; + + local->hashed_subvol = hashed_subvol; + + if (is_revalidate (loc)) { + layout = local->layout; + if (!layout) { + gf_log (this->name, GF_LOG_DEBUG, + "revalidate without cache. path=%s", + loc->path); + op_errno = EINVAL; + goto err; + } + + if (layout->gen && (layout->gen < conf->gen)) { + gf_log (this->name, GF_LOG_DEBUG, + "incomplete layout failure for path=%s", + loc->path); + dht_layout_unref (this, local->layout); + goto do_fresh_lookup; + } + + local->inode = inode_ref (loc->inode); + + local->call_cnt = layout->cnt; + call_cnt = local->call_cnt; + + /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' + * attribute, revalidates directly go to the cached-subvolume. + */ + ret = dict_set_uint32 (local->xattr_req, + conf->xattr_name, 4 * 4); + if (ret < 0) + gf_log (this->name, GF_LOG_WARNING, + "failed to set dict value for %s", + conf->xattr_name); + + for (i = 0; i < layout->cnt; i++) { + subvol = layout->list[i].xlator; + + STACK_WIND (frame, dht_revalidate_cbk, + subvol, subvol->fops->lookup, + loc, local->xattr_req); + + if (!--call_cnt) + break; + } + } else { + do_fresh_lookup: + ret = dict_set_uint32 (local->xattr_req, + conf->xattr_name, 4 * 4); + if (ret < 0) + gf_log (this->name, GF_LOG_WARNING, + "failed to set dict value for %s", + conf->xattr_name); + + ret = dict_set_uint32 (local->xattr_req, + conf->link_xattr_name, 256); + if (ret < 0) + gf_log (this->name, GF_LOG_WARNING, + "failed to set dict value for %s", + conf->link_xattr_name); + + if (!hashed_subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no subvolume in layout for path=%s, " + "checking on all the subvols to see if " + "it is a directory", loc->path); + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + local->layout = dht_layout_new (this, + conf->subvolume_cnt); + if (!local->layout) { + op_errno = ENOMEM; + goto err; + } + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_lookup_dir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, + &local->loc, local->xattr_req); + } + return 0; + } + + /* */ + cached_subvol = get_switch_matching_subvol (loc->path, conf, + hashed_subvol); + if (cached_subvol == hashed_subvol) { + STACK_WIND (frame, dht_lookup_cbk, + hashed_subvol, + hashed_subvol->fops->lookup, + loc, local->xattr_req); + } else { + STACK_WIND (frame, switch_local_lookup_cbk, + cached_subvol, + cached_subvol->fops->lookup, + loc, local->xattr_req); + } + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (lookup, frame, -1, op_errno, + NULL, NULL, NULL, NULL); + return 0; +} + +int +switch_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + + local = frame->local; + + if (op_ret == -1) + goto err; + + STACK_WIND (frame, dht_create_cbk, + local->cached_subvol, local->cached_subvol->fops->create, + &local->loc, local->flags, local->mode, local->umask, + local->fd, local->params); + + return 0; + +err: + DHT_STACK_UNWIND (create, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL, NULL); + return 0; +} + +int +switch_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, mode_t mode, + mode_t umask, fd_t *fd, dict_t *params) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + xlator_t *subvol = NULL; + xlator_t *avail_subvol = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + + conf = this->private; + + dht_get_du_info (frame, this, loc); + + local = dht_local_init (frame, loc, fd, GF_FOP_CREATE); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = dht_subvol_get_hashed (this, loc); + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } + + avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol); + if (dht_is_subvol_filled (this, avail_subvol)) { + avail_subvol = + dht_free_disk_available_subvol (this, avail_subvol, + local); + } + + if (subvol != avail_subvol) { + /* create a link file instead of actual file */ + local->mode = mode; + local->flags = flags; + local->umask = umask; + local->cached_subvol = avail_subvol; + dht_linkfile_create (frame, switch_create_linkfile_create_cbk, + this, avail_subvol, subvol, loc); + return 0; + } + + gf_log (this->name, GF_LOG_TRACE, + "creating %s on %s", loc->path, subvol->name); + + STACK_WIND (frame, dht_create_cbk, + subvol, subvol->fops->create, + loc, flags, mode, umask, fd, params); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (create, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL, NULL); + + return 0; +} + +int +switch_mknod_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + + local = frame->local; + if (!local || !local->cached_subvol) { + op_errno = EINVAL; + op_ret = -1; + goto err; + } + + if (op_ret >= 0) { + STACK_WIND_COOKIE (frame, dht_newfile_cbk, + (void *)local->cached_subvol, local->cached_subvol, + local->cached_subvol->fops->mknod, + &local->loc, local->mode, local->rdev, + local->umask, local->params); + + return 0; + } +err: + DHT_STACK_UNWIND (link, frame, op_ret, op_errno, + inode, stbuf, preparent, postparent, xdata); + return 0; +} + + +int +switch_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *params) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + xlator_t *subvol = NULL; + xlator_t *avail_subvol = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + + conf = this->private; + + dht_get_du_info (frame, this, loc); + + local = dht_local_init (frame, loc, NULL, GF_FOP_MKNOD); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = dht_subvol_get_hashed (this, loc); + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } + + /* Consider the disksize in consideration */ + avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol); + if (dht_is_subvol_filled (this, avail_subvol)) { + avail_subvol = + dht_free_disk_available_subvol (this, avail_subvol, + local); + } + + if (avail_subvol != subvol) { + /* Create linkfile first */ + + local->params = dict_ref (params); + local->mode = mode; + local->umask = umask; + local->rdev = rdev; + local->cached_subvol = avail_subvol; + + dht_linkfile_create (frame, switch_mknod_linkfile_cbk, + this, avail_subvol, subvol, loc); + return 0; + } + + gf_log (this->name, GF_LOG_TRACE, + "creating %s on %s", loc->path, subvol->name); + + STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, subvol, + subvol->fops->mknod, loc, mode, rdev, umask, + params); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (mknod, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL); + + return 0; +} + + +void +switch_fini (xlator_t *this) +{ + dht_conf_t *conf = NULL; + struct switch_struct *trav = NULL; + struct switch_struct *prev = NULL; + + conf = this->private; + + if (conf) { + trav = (struct switch_struct *)conf->private; + conf->private = NULL; + while (trav) { + GF_FREE (trav->array); + prev = trav; + trav = trav->next; + GF_FREE (prev); + } + } + + dht_fini(this); +} + +int +set_switch_pattern (xlator_t *this, dht_conf_t *conf, + const char *pattern_str) +{ + int flag = 0; + int idx = 0; + int index = 0; + int child_count = 0; + char *tmp = NULL; + char *tmp1 = NULL; + char *child = NULL; + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *dup_str = NULL; + char *dup_childs = NULL; + char *switch_str = NULL; + char *pattern = NULL; + char *childs = NULL; + char *option_string = NULL; + struct switch_struct *switch_buf = NULL; + struct switch_struct *switch_opt = NULL; + struct switch_struct *trav = NULL; + struct switch_sched_array *switch_buf_array = NULL; + xlator_list_t *trav_xl = NULL; + + trav_xl = this->children; + while (trav_xl) { + index++; + trav_xl = trav_xl->next; + } + child_count = index; + switch_buf_array = GF_CALLOC ((index + 1), + sizeof (struct switch_sched_array), + gf_switch_mt_switch_sched_array); + if (!switch_buf_array) + goto err; + + trav_xl = this->children; + index = 0; + + while (trav_xl) { + switch_buf_array[index].xl = trav_xl->xlator; + switch_buf_array[index].eligible = 1; + trav_xl = trav_xl->next; + index++; + } + + /* *jpg:child1,child2;*mpg:child3;*:child4,child5,child6 */ + + /* Get the pattern for considering switch case. + "option block-size *avi:10MB" etc */ + option_string = gf_strdup (pattern_str); + switch_str = strtok_r (option_string, ";", &tmp_str); + while (switch_str) { + dup_str = gf_strdup (switch_str); + switch_opt = GF_CALLOC (1, sizeof (struct switch_struct), + gf_switch_mt_switch_struct); + if (!switch_opt) { + GF_FREE (dup_str); + goto err; + } + + pattern = strtok_r (dup_str, ":", &tmp_str1); + childs = strtok_r (NULL, ":", &tmp_str1); + if (strncmp (pattern, "*", 2) == 0) { + gf_log ("switch", GF_LOG_INFO, + "'*' pattern will be taken by default " + "for all the unconfigured child nodes," + " hence neglecting current option"); + switch_str = strtok_r (NULL, ";", &tmp_str); + GF_FREE (switch_opt); + GF_FREE (dup_str); + continue; + } + GF_FREE (dup_str); + memcpy (switch_opt->path_pattern, pattern, strlen (pattern)); + if (childs) { + dup_childs = gf_strdup (childs); + child = strtok_r (dup_childs, ",", &tmp); + while (child) { + if (gf_switch_valid_child (this, child)) { + idx++; + child = strtok_r (NULL, ",", &tmp); + } else { + gf_log (this->name, GF_LOG_ERROR, + "%s is not a subvolume of %s. " + "pattern can only be scheduled " + "only to a subvolume of %s", + child, this->name, this->name); + goto err; + } + } + GF_FREE (dup_childs); + child = strtok_r (childs, ",", &tmp1); + switch_opt->num_child = idx; + switch_opt->array = GF_CALLOC (1, (idx * + sizeof (struct switch_sched_array)), + gf_switch_mt_switch_sched_array); + if (!switch_opt->array) + goto err; + idx = 0; + while (child) { + for (index = 0; index < child_count; index++) { + if (strcmp (switch_buf_array[index].xl->name, + child) == 0) { + gf_log ("switch", GF_LOG_DEBUG, + "'%s' pattern will be " + "scheduled to \"%s\"", + switch_opt->path_pattern, child); + /* + if (switch_buf_array[index-1].considered) { + gf_log ("switch", GF_LOG_DEBUG, + "ambiguity found, exiting"); + return -1; + } + */ + switch_opt->array[idx].xl = switch_buf_array[index].xl; + switch_buf_array[index].considered = 1; + idx++; + break; + } + } + child = strtok_r (NULL, ",", &tmp1); + } + } else { + /* error */ + gf_log ("switch", GF_LOG_ERROR, + "Check \"scheduler.switch.case\" " + "option in unify volume. Exiting"); + goto err; + } + + /* Link it to the main structure */ + if (switch_buf) { + /* there are already few entries */ + trav = switch_buf; + while (trav->next) + trav = trav->next; + trav->next = switch_opt; + } else { + /* First entry */ + switch_buf = switch_opt; + } + switch_opt = NULL; + switch_str = strtok_r (NULL, ";", &tmp_str); + } + + /* Now, all the pattern based considerations done, so for all the + * remaining pattern, '*' to all the remaining child nodes + */ + { + for (index=0; index < child_count; index++) { + /* check for considered flag */ + if (switch_buf_array[index].considered) + continue; + flag++; + } + if (!flag) { + gf_log ("switch", GF_LOG_ERROR, + "No nodes left for pattern '*'. Exiting"); + goto err; + } + switch_opt = GF_CALLOC (1, sizeof (struct switch_struct), + gf_switch_mt_switch_struct); + if (!switch_opt) + goto err; + + /* Add the '*' pattern to the array */ + memcpy (switch_opt->path_pattern, "*", 2); + switch_opt->num_child = flag; + switch_opt->array = + GF_CALLOC (1, + flag * sizeof (struct switch_sched_array), + gf_switch_mt_switch_sched_array); + if (!switch_opt->array) + goto err; + flag = 0; + for (index=0; index < child_count; index++) { + /* check for considered flag */ + if (switch_buf_array[index].considered) + continue; + gf_log ("switch", GF_LOG_DEBUG, + "'%s' pattern will be scheduled to \"%s\"", + switch_opt->path_pattern, + switch_buf_array[index].xl->name); + switch_opt->array[flag].xl = + switch_buf_array[index].xl; + switch_buf_array[index].considered = 1; + flag++; + } + if (switch_buf) { + /* there are already few entries */ + trav = switch_buf; + while (trav->next) + trav = trav->next; + trav->next = switch_opt; + } else { + /* First entry */ + switch_buf = switch_opt; + } + switch_opt = NULL; + } + /* */ + conf->private = switch_buf; + + return 0; +err: + GF_FREE (switch_buf_array); + GF_FREE (switch_opt); + + if (switch_buf) { + trav = switch_buf; + while (trav) { + GF_FREE (trav->array); + switch_opt = trav; + trav = trav->next; + GF_FREE (switch_opt); + } + } + return -1; +} + + +int32_t +switch_init (xlator_t *this) +{ + dht_conf_t *conf = NULL; + data_t *data = NULL; + int ret = -1; + + ret = dht_init(this); + if (ret) { + return ret; + } + conf = this->private; + + data = dict_get (this->options, "pattern.switch.case"); + if (data) { + /* TODO: */ + ret = set_switch_pattern (this, conf, data->data); + if (ret) { + goto err; + } + } + + this->private = conf; + return 0; + +err: + dht_fini(this); + return -1; +} + + +class_methods_t class_methods = { + .init = switch_init, + .fini = switch_fini, + .reconfigure = dht_reconfigure, + .notify = dht_notify +}; + + +struct xlator_fops fops = { + .lookup = switch_lookup, + .create = switch_create, + .mknod = switch_mknod, + + .stat = dht_stat, + .fstat = dht_fstat, + .truncate = dht_truncate, + .ftruncate = dht_ftruncate, + .access = dht_access, + .readlink = dht_readlink, + .setxattr = dht_setxattr, + .getxattr = dht_getxattr, + .removexattr = dht_removexattr, + .open = dht_open, + .readv = dht_readv, + .writev = dht_writev, + .flush = dht_flush, + .fsync = dht_fsync, + .statfs = dht_statfs, + .lk = dht_lk, + .opendir = dht_opendir, + .readdir = dht_readdir, + .readdirp = dht_readdirp, + .fsyncdir = dht_fsyncdir, + .symlink = dht_symlink, + .unlink = dht_unlink, + .link = dht_link, + .mkdir = dht_mkdir, + .rmdir = dht_rmdir, + .rename = dht_rename, + .inodelk = dht_inodelk, + .finodelk = dht_finodelk, + .entrylk = dht_entrylk, + .fentrylk = dht_fentrylk, + .xattrop = dht_xattrop, + .fxattrop = dht_fxattrop, + .setattr = dht_setattr, +}; + + +struct xlator_cbks cbks = { + .forget = dht_forget +}; diff --git a/xlators/cluster/dht/src/unittest/dht_layout_mock.c b/xlators/cluster/dht/src/unittest/dht_layout_mock.c new file mode 100644 index 000000000..aa19ddc57 --- /dev/null +++ b/xlators/cluster/dht/src/unittest/dht_layout_mock.c @@ -0,0 +1,63 @@ +/* + Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "xlator.h" +#include "dht-common.h" +#include "byte-order.h" + +int +dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p) +{ + return 0; +} + +int +dht_inode_ctx_layout_get (inode_t *inode, xlator_t *this, dht_layout_t **layout) +{ + return 0; +} + +int +dht_inode_ctx_layout_set (inode_t *inode, xlator_t *this, + dht_layout_t *layout_int) +{ + return 0; +} + +int +dict_get_ptr (dict_t *this, char *key, void **ptr) +{ + return 0; +} + +int +dict_get_ptr_and_len (dict_t *this, char *key, void **ptr, int *len) +{ + return 0; +} + +int _gf_log (const char *domain, const char *file, + const char *function, int32_t line, gf_loglevel_t level, + const char *fmt, ...) +{ + return 0; +} + +int _gf_log_callingfn (const char *domain, const char *file, + const char *function, int32_t line, gf_loglevel_t level, + const char *fmt, ...) +{ + return 0; +} diff --git a/xlators/cluster/dht/src/unittest/dht_layout_unittest.c b/xlators/cluster/dht/src/unittest/dht_layout_unittest.c new file mode 100644 index 000000000..b5233d235 --- /dev/null +++ b/xlators/cluster/dht/src/unittest/dht_layout_unittest.c @@ -0,0 +1,124 @@ +/* + Copyright (c) 2008-2014 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "dht-common.h" +#include "logging.h" +#include "xlator.h" + +#include <stdarg.h> +#include <stddef.h> +#include <setjmp.h> +#include <inttypes.h> +#include <cmockery/pbc.h> +#include <cmockery/cmockery.h> + +/* + * Helper functions + */ + +static xlator_t * +helper_xlator_init(uint32_t num_types) +{ + xlator_t *xl; + int i, ret; + + REQUIRE(num_types > 0); + + xl = test_calloc(1, sizeof(xlator_t)); + assert_non_null(xl); + xl->mem_acct.num_types = num_types; + xl->mem_acct.rec = test_calloc(num_types, sizeof(struct mem_acct_rec)); + assert_non_null(xl->mem_acct.rec); + + xl->ctx = test_calloc(1, sizeof(glusterfs_ctx_t)); + assert_non_null(xl->ctx); + + for (i = 0; i < num_types; i++) { + ret = LOCK_INIT(&(xl->mem_acct.rec[i].lock)); + assert_false(ret); + } + + ENSURE(num_types == xl->mem_acct.num_types); + ENSURE(NULL != xl); + + return xl; +} + +static int +helper_xlator_destroy(xlator_t *xl) +{ + int i, ret; + + for (i = 0; i < xl->mem_acct.num_types; i++) { + ret = LOCK_DESTROY(&(xl->mem_acct.rec[i].lock)); + assert_int_equal(ret, 0); + } + + free(xl->mem_acct.rec); + free(xl->ctx); + free(xl); + return 0; +} + +/* + * Unit tests + */ +static void +test_dht_layout_new(void **state) +{ + xlator_t *xl; + dht_layout_t *layout; + dht_conf_t *conf; + int cnt; + + expect_assert_failure(dht_layout_new(NULL, 0)); + expect_assert_failure(dht_layout_new((xlator_t *)0x12345, -1)); + xl = helper_xlator_init(10); + + // xl->private is NULL + assert_null(xl->private); + cnt = 100; + layout = dht_layout_new(xl, cnt); + assert_non_null(layout); + assert_int_equal(layout->type, DHT_HASH_TYPE_DM); + assert_int_equal(layout->cnt, cnt); + assert_int_equal(layout->ref, 1); + assert_int_equal(layout->gen, 0); + assert_int_equal(layout->spread_cnt, 0); + free(layout); + + // xl->private is not NULL + cnt = 110; + conf = (dht_conf_t *)test_calloc(1, sizeof(dht_conf_t)); + assert_non_null(conf); + conf->dir_spread_cnt = 12345; + conf->gen = -123; + xl->private = conf; + + layout = dht_layout_new(xl, cnt); + assert_non_null(layout); + assert_int_equal(layout->type, DHT_HASH_TYPE_DM); + assert_int_equal(layout->cnt, cnt); + assert_int_equal(layout->ref, 1); + assert_int_equal(layout->gen, conf->gen); + assert_int_equal(layout->spread_cnt, conf->dir_spread_cnt); + free(layout); + + free(conf); + helper_xlator_destroy(xl); +} + +int main(void) { + const UnitTest tests[] = { + unit_test(test_dht_layout_new), + }; + + return run_tests(tests, "xlator_dht_layout"); +} diff --git a/xlators/cluster/ha/src/Makefile.am b/xlators/cluster/ha/src/Makefile.am index 5f78a2965..5c1364b7f 100644 --- a/xlators/cluster/ha/src/Makefile.am +++ b/xlators/cluster/ha/src/Makefile.am @@ -1,15 +1,16 @@ xlator_LTLIBRARIES = ha.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/cluster -ha_la_LDFLAGS = -module -avoidversion +ha_la_LDFLAGS = -module -avoid-version ha_la_SOURCES = ha-helpers.c ha.c ha_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la noinst_HEADERS = ha.h -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) CLEANFILES = diff --git a/xlators/cluster/ha/src/ha-helpers.c b/xlators/cluster/ha/src/ha-helpers.c index 4bc7d5e20..19be1ed27 100644 --- a/xlators/cluster/ha/src/ha-helpers.c +++ b/xlators/cluster/ha/src/ha-helpers.c @@ -1,22 +1,12 @@ /* - Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ #include "xlator.h" #include "call-stub.h" #include "defaults.h" @@ -49,12 +39,14 @@ int ha_alloc_init_fd (call_frame_t *frame, fd_t *fd) goto out; } hafdp = (hafd_t *)(long)tmp_hafdp; - local = frame->local = CALLOC (1, sizeof (*local)); + local = frame->local = GF_CALLOC (1, sizeof (*local), + gf_ha_mt_ha_local_t); if (local == NULL) { ret = -ENOMEM; goto out; } - local->state = CALLOC (1, child_count); + local->state = GF_CALLOC (1, child_count, + gf_ha_mt_child_count); if (local->state == NULL) { ret = -ENOMEM; goto out; @@ -141,11 +133,17 @@ int ha_handle_cbk (call_frame_t *frame, void *cookie, int op_ret, int op_errno) } } } - if (local->stub) + if (local->stub) { call_stub_destroy (local->stub); + local->stub = NULL; + } + if (local->fd) { - FREE (local->state); + GF_FREE (local->state); + local->state = NULL; + fd_unref (local->fd); + local->fd = NULL; } return 0; } @@ -164,7 +162,8 @@ int ha_alloc_init_inode (call_frame_t *frame, inode_t *inode) local = frame->local; if (local == NULL) { - local = frame->local = CALLOC (1, sizeof (*local)); + local = frame->local = GF_CALLOC (1, sizeof (*local), + gf_ha_mt_ha_local_t); if (local == NULL) { ret = -ENOMEM; goto out; diff --git a/xlators/cluster/ha/src/ha-mem-types.h b/xlators/cluster/ha/src/ha-mem-types.h new file mode 100644 index 000000000..e5e97d237 --- /dev/null +++ b/xlators/cluster/ha/src/ha-mem-types.h @@ -0,0 +1,26 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __HA_MEM_TYPES_H__ +#define __HA_MEM_TYPES_H__ + +#include "mem-types.h" + +enum gf_ha_mem_types_ { + gf_ha_mt_ha_local_t = gf_common_mt_end + 1, + gf_ha_mt_hafd_t, + gf_ha_mt_char, + gf_ha_mt_child_count, + gf_ha_mt_xlator_t, + gf_ha_mt_ha_private_t, + gf_ha_mt_end +}; +#endif + diff --git a/xlators/cluster/ha/src/ha.c b/xlators/cluster/ha/src/ha.c index b938071ed..3eccb516b 100644 --- a/xlators/cluster/ha/src/ha.c +++ b/xlators/cluster/ha/src/ha.c @@ -1,22 +1,12 @@ /* - Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ /* generate errors randomly, code is simple now, better alogorithm * can be written to decide what error to be returned and when */ @@ -41,6 +31,41 @@ * - do not alloc the call-stub in case only one subvol is up. */ +void +ha_local_wipe (ha_local_t *local) +{ + if (local->stub) { + call_stub_destroy (local->stub); + local->stub = NULL; + } + + if (local->state) { + GF_FREE (local->state); + local->state = NULL; + } + + if (local->dict) { + dict_unref (local->dict); + local->dict = NULL; + } + + loc_wipe (&local->loc); + + if (local->fd) { + fd_unref (local->fd); + local->fd = NULL; + } + + if (local->inode) { + inode_unref (local->inode); + local->inode = NULL; + } + + GF_FREE (local); + return; +} + + int ha_forget (xlator_t *this, inode_t *inode) @@ -49,7 +74,7 @@ ha_forget (xlator_t *this, char *state = NULL; if (!inode_ctx_del (inode, this, &stateino)) { state = ((char *)(long)stateino); - FREE (state); + GF_FREE (state); } return 0; @@ -63,8 +88,9 @@ ha_lookup_cbk (call_frame_t *frame, int32_t op_ret, int32_t op_errno, inode_t *inode, - struct stat *buf, - dict_t *dict) + struct iatt *buf, + dict_t *dict, + struct iatt *postparent) { ha_local_t *local = NULL; ha_private_t *pvt = NULL; @@ -106,6 +132,7 @@ ha_lookup_cbk (call_frame_t *frame, if (local->op_ret == -1 && op_ret == 0) { local->op_ret = 0; local->buf = *buf; + local->postparent = *postparent; if (dict) local->dict = dict_ref (dict); } @@ -127,7 +154,8 @@ ha_lookup_cbk (call_frame_t *frame, local->op_errno, inode, &local->buf, - ctx); + ctx, + &local->postparent); if (inode) inode_unref (inode); if (ctx) @@ -148,19 +176,33 @@ ha_lookup (call_frame_t *frame, char *state = NULL; xlator_t **children = NULL; int ret = -1; + int32_t op_errno = EINVAL; local = frame->local; pvt = this->private; child_count = pvt->child_count; children = pvt->children; - frame->local = local = CALLOC (1, sizeof (*local)); + frame->local = local = GF_CALLOC (1, sizeof (*local), + gf_ha_mt_ha_local_t); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + op_errno = ENOMEM; + goto unwind; + } + child_count = pvt->child_count; local->inode = inode_ref (loc->inode); ret = inode_ctx_get (loc->inode, this, NULL); if (ret) { - state = CALLOC (1, child_count); + state = GF_CALLOC (1, child_count, gf_ha_mt_child_count); + if (state == NULL) { + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + op_errno = ENOMEM; + goto unwind; + } + inode_ctx_put (loc->inode, this, (uint64_t)(long)state); } else local->revalidate = 1; @@ -178,6 +220,14 @@ ha_lookup (call_frame_t *frame, xattr_req); } return 0; + +unwind: + local = frame->local; + frame->local = NULL; + STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL, NULL); + + ha_local_wipe (local); + return 0; } int32_t @@ -186,7 +236,7 @@ ha_stat_cbk (call_frame_t *frame, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct stat *buf) + struct iatt *buf) { int ret = -1; @@ -229,135 +279,25 @@ err: return 0; } - int32_t -ha_chmod_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *buf) -{ - int ret = -1; - - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - - if (ret == 0) { - STACK_UNWIND (frame, - op_ret, - op_errno, - buf); - } - return 0; -} - int32_t -ha_chmod (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - mode_t mode) -{ - ha_local_t *local = NULL; - int op_errno = 0; - - op_errno = ha_alloc_init_inode (frame, loc->inode); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - local->stub = fop_chmod_stub (frame, ha_chmod, loc, mode); - - STACK_WIND_COOKIE (frame, - ha_chmod_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->chmod, - loc, - mode); - return 0; -err: - STACK_UNWIND (frame, -1, op_errno, NULL); - return 0; -} - - int32_t -ha_fchmod_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *buf) +ha_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *statpre, + struct iatt *statpost) { int ret = -1; ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); if (ret == 0) { - STACK_UNWIND (frame, - op_ret, - op_errno, - buf); - } - return 0; -} - -int32_t -ha_fchmod (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - mode_t mode) -{ - ha_local_t *local = NULL; - int op_errno = 0; - - op_errno = ha_alloc_init_fd (frame, fd); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; + STACK_UNWIND (frame, op_ret, op_errno, statpre, statpost); } - local = frame->local; - local->stub = fop_fchmod_stub (frame, ha_fchmod, fd, mode); - - STACK_WIND_COOKIE (frame, - ha_fchmod_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->fchmod, - fd, - mode); - return 0; -err: - STACK_UNWIND (frame, -1, op_errno, NULL); return 0; } - int32_t -ha_chown_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *buf) -{ - int ret = -1; - - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - - if (ret == 0) { - STACK_UNWIND (frame, - op_ret, - op_errno, - buf); - } - return 0; -} int32_t -ha_chown (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - uid_t uid, - gid_t gid) +ha_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid) { ha_local_t *local = NULL; int op_errno = 0; @@ -368,49 +308,24 @@ ha_chown (call_frame_t *frame, goto err; } local = frame->local; - local->stub = fop_chown_stub (frame, ha_chown, loc, uid, gid); + local->stub = fop_setattr_stub (frame, ha_setattr, loc, stbuf, valid); - STACK_WIND_COOKIE (frame, - ha_chown_cbk, + STACK_WIND_COOKIE (frame, + ha_setattr_cbk, (void *)(long)local->active, HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->chown, - loc, - uid, - gid); + HA_ACTIVE_CHILD(this, local)->fops->setattr, + loc, stbuf, valid); return 0; err: - STACK_UNWIND (frame, -1, ENOTCONN, NULL); + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); return 0; } - int32_t -ha_fchown_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *buf) -{ - int ret = -1; - - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - if (ret == 0) { - STACK_UNWIND (frame, - op_ret, - op_errno, - buf); - } - return 0; -} - -int32_t -ha_fchown (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - uid_t uid, - gid_t gid) +int32_t +ha_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, + int32_t valid) { ha_local_t *local = NULL; int op_errno = 0; @@ -421,29 +336,29 @@ ha_fchown (call_frame_t *frame, goto err; } local = frame->local; - local->stub = fop_fchown_stub (frame, ha_fchown, fd, uid, gid); + local->stub = fop_fsetattr_stub (frame, ha_fsetattr, fd, stbuf, valid); STACK_WIND_COOKIE (frame, - ha_fchown_cbk, + ha_setattr_cbk, (void *)(long)local->active, HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->fchown, - fd, - uid, - gid); + HA_ACTIVE_CHILD(this, local)->fops->fsetattr, + fd, stbuf, valid); return 0; err: - STACK_UNWIND (frame, -1, op_errno, NULL); + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); return 0; } - int32_t + +int32_t ha_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct stat *buf) + struct iatt *prebuf, + struct iatt *postbuf) { int ret = -1; @@ -452,7 +367,8 @@ ha_truncate_cbk (call_frame_t *frame, STACK_UNWIND (frame, op_ret, op_errno, - buf); + prebuf, + postbuf); } return 0; } @@ -473,6 +389,11 @@ ha_truncate (call_frame_t *frame, } local = frame->local; local->stub = fop_truncate_stub (frame, ha_truncate, loc, offset); + if (!local->stub) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } STACK_WIND_COOKIE (frame, ha_truncate_cbk, @@ -483,7 +404,7 @@ ha_truncate (call_frame_t *frame, offset); return 0; err: - STACK_UNWIND (frame, -1, op_errno, NULL); + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); return 0; } @@ -493,7 +414,8 @@ ha_ftruncate_cbk (call_frame_t *frame, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct stat *buf) + struct iatt *prebuf, + struct iatt *postbuf) { int ret = -1; @@ -503,7 +425,7 @@ ha_ftruncate_cbk (call_frame_t *frame, STACK_UNWIND (frame, op_ret, op_errno, - buf); + prebuf, postbuf); } return 0; } @@ -524,6 +446,11 @@ ha_ftruncate (call_frame_t *frame, } local = frame->local; local->stub = fop_ftruncate_stub (frame, ha_ftruncate, fd, offset); + if (!local->stub) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } STACK_WIND_COOKIE (frame, ha_ftruncate_cbk, @@ -534,58 +461,12 @@ ha_ftruncate (call_frame_t *frame, offset); return 0; err: - STACK_UNWIND (frame, -1, op_errno, NULL); - return 0; -} - -int32_t -ha_utimens_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *buf) -{ - int ret = -1; - - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - - if (ret == 0) { - STACK_UNWIND (frame, - op_ret, - op_errno, - buf); - } - return 0; -} - -int32_t -ha_utimens (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - struct timespec tv[2]) -{ - ha_local_t *local = NULL; - int op_errno = 0; + local = frame->local; + frame->local = NULL; - op_errno = ha_alloc_init_inode (frame, loc->inode); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - local->stub = fop_utimens_stub (frame, ha_utimens, loc, tv); + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - STACK_WIND_COOKIE (frame, - ha_utimens_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->utimens, - loc, - tv); - return 0; -err: - STACK_UNWIND (frame, -1, op_errno, NULL); + ha_local_wipe (local); return 0; } @@ -624,6 +505,11 @@ ha_access (call_frame_t *frame, } local = frame->local; local->stub = fop_access_stub (frame, ha_access, loc, mask); + if (!local->stub) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } STACK_WIND_COOKIE (frame, ha_access_cbk, @@ -634,7 +520,12 @@ ha_access (call_frame_t *frame, mask); return 0; err: + local = frame->local; + frame->local = NULL; + STACK_UNWIND (frame, -1, op_errno); + + ha_local_wipe (local); return 0; } @@ -645,7 +536,8 @@ ha_readlink_cbk (call_frame_t *frame, xlator_t *this, int32_t op_ret, int32_t op_errno, - const char *path) + const char *path, + struct iatt *sbuf) { int ret = -1; @@ -655,7 +547,8 @@ ha_readlink_cbk (call_frame_t *frame, STACK_UNWIND (frame, op_ret, op_errno, - path); + path, + sbuf); } return 0; } @@ -686,7 +579,7 @@ ha_readlink (call_frame_t *frame, size); return 0; err: - STACK_UNWIND (frame, -1, op_errno, NULL); + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); return 0; } @@ -697,8 +590,9 @@ ha_mknod_lookup_cbk (call_frame_t *frame, int32_t op_ret, int32_t op_errno, inode_t *inode, - struct stat *buf, - dict_t *dict) + struct iatt *buf, + dict_t *dict, + struct iatt *postparent) { ha_local_t *local = NULL; ha_private_t *pvt = NULL; @@ -742,12 +636,13 @@ ha_mknod_lookup_cbk (call_frame_t *frame, if (cnt == 0) { call_stub_t *stub = local->stub; - FREE (local->state); + GF_FREE (local->state); STACK_UNWIND (frame, local->op_ret, local->op_errno, local->stub->args.mknod.loc.inode, - &local->buf); + &local->buf, &local->preparent, + &local->postparent); call_stub_destroy (stub); } return 0; @@ -760,7 +655,9 @@ ha_mknod_cbk (call_frame_t *frame, int32_t op_ret, int32_t op_errno, inode_t *inode, - struct stat *buf) + struct iatt *buf, + struct iatt *preparent, + struct iatt *postparent) { ha_local_t *local = NULL; ha_private_t *pvt = NULL; @@ -798,6 +695,8 @@ ha_mknod_cbk (call_frame_t *frame, local->op_ret = 0; local->first_success = 1; local->buf = *buf; + local->preparent = *preparent; + local->postparent = *postparent; } cnt = --local->call_count; for (i = local->active + 1; i < child_count; i++) { @@ -807,9 +706,11 @@ ha_mknod_cbk (call_frame_t *frame, if (cnt == 0 || i == child_count) { call_stub_t *stub = local->stub; - FREE (local->state); + GF_FREE (local->state); stub = local->stub; - STACK_UNWIND (frame, local->op_ret, local->op_errno, local->stub->args.mknod.loc.inode, &local->buf); + STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->stub->args.mknod.loc.inode, &local->buf, + &local->preparent, &local->postparent); call_stub_destroy (stub); return 0; } @@ -854,20 +755,46 @@ ha_mknod (call_frame_t *frame, ha_private_t *pvt = NULL; int child_count = 0, i = 0; char *stateino = NULL; + int32_t op_errno = EINVAL; local = frame->local; pvt = this->private; child_count = pvt->child_count; - frame->local = local = CALLOC (1, sizeof (*local)); + frame->local = local = GF_CALLOC (1, sizeof (*local), + gf_ha_mt_ha_local_t); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + op_errno = ENOMEM; + goto err; + } + local->stub = fop_mknod_stub (frame, ha_mknod, loc, mode, rdev); + if (!local->stub) { + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + op_errno = ENOMEM; + goto err; + } + local->op_ret = -1; local->op_errno = ENOTCONN; - local->state = CALLOC (1, child_count); + local->state = GF_CALLOC (1, child_count, gf_ha_mt_char); + if (!local->state) { + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + op_errno = ENOMEM; + goto err; + } + memcpy (local->state, pvt->state, child_count); local->active = -1; - stateino = CALLOC (1, child_count); + stateino = GF_CALLOC (1, child_count, gf_ha_mt_char); + if (!stateino) { + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + op_errno = ENOMEM; + goto err; + } + inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino); for (i = 0; i < child_count; i++) { @@ -884,6 +811,14 @@ ha_mknod (call_frame_t *frame, HA_ACTIVE_CHILD(this, local)->fops->mknod, loc, mode, rdev); return 0; + +err: + local = frame->local; + frame->local = NULL; + + STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL, NULL); + ha_local_wipe (local); + return 0; } @@ -894,8 +829,9 @@ ha_mkdir_lookup_cbk (call_frame_t *frame, int32_t op_ret, int32_t op_errno, inode_t *inode, - struct stat *buf, - dict_t *dict) + struct iatt *buf, + dict_t *dict, + struct iatt *postparent) { ha_local_t *local = NULL; ha_private_t *pvt = NULL; @@ -931,12 +867,12 @@ ha_mkdir_lookup_cbk (call_frame_t *frame, if (cnt == 0) { call_stub_t *stub = local->stub; - FREE (local->state); + GF_FREE (local->state); STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->stub->args.mkdir.loc.inode, - &local->buf); + local->stub->args.mkdir.loc.inode, &local->buf, + &local->preparent, &local->postparent); call_stub_destroy (stub); } return 0; @@ -949,7 +885,9 @@ ha_mkdir_cbk (call_frame_t *frame, int32_t op_ret, int32_t op_errno, inode_t *inode, - struct stat *buf) + struct iatt *buf, + struct iatt *preparent, + struct iatt *postparent) { ha_local_t *local = NULL; ha_private_t *pvt = NULL; @@ -983,6 +921,8 @@ ha_mkdir_cbk (call_frame_t *frame, local->op_ret = 0; local->first_success = 1; local->buf = *buf; + local->preparent = *preparent; + local->postparent = *postparent; } cnt = --local->call_count; for (i = local->active + 1; i < child_count; i++) { @@ -992,9 +932,11 @@ ha_mkdir_cbk (call_frame_t *frame, if (cnt == 0 || i == child_count) { call_stub_t *stub = local->stub; - FREE (local->state); + GF_FREE (local->state); stub = local->stub; - STACK_UNWIND (frame, local->op_ret, local->op_errno, local->stub->args.mkdir.loc.inode, &local->buf); + STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->stub->args.mkdir.loc.inode, &local->buf, + &local->preparent, &local->postparent); call_stub_destroy (stub); return 0; } @@ -1037,20 +979,46 @@ ha_mkdir (call_frame_t *frame, ha_private_t *pvt = NULL; int child_count = 0, i = 0; char *stateino = NULL; + int32_t op_errno = EINVAL; local = frame->local; pvt = this->private; child_count = pvt->child_count; - frame->local = local = CALLOC (1, sizeof (*local)); + frame->local = local = GF_CALLOC (1, sizeof (*local), + gf_ha_mt_ha_local_t); + if (!frame->local) { + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + op_errno = ENOMEM; + goto err; + } + local->stub = fop_mkdir_stub (frame, ha_mkdir, loc, mode); + if (!local->stub) { + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + op_errno = ENOMEM; + goto err; + } + local->op_ret = -1; local->op_errno = ENOTCONN; - local->state = CALLOC (1, child_count); + local->state = GF_CALLOC (1, child_count, gf_ha_mt_char); + if (!local->state) { + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + op_errno = ENOMEM; + goto err; + } + memcpy (local->state, pvt->state, child_count); local->active = -1; - stateino = CALLOC (1, child_count); + stateino = GF_CALLOC (1, child_count, gf_ha_mt_char); + if (!stateino) { + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + op_errno = ENOMEM; + goto err; + } + inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino); for (i = 0; i < child_count; i++) { if (local->state[i]) { @@ -1066,6 +1034,13 @@ ha_mkdir (call_frame_t *frame, HA_ACTIVE_CHILD(this, local)->fops->mkdir, loc, mode); return 0; +err: + local = frame->local; + frame->local = NULL; + + STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL, NULL); + ha_local_wipe (local); + return 0; } int32_t @@ -1073,13 +1048,15 @@ ha_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno) + int32_t op_errno, + struct iatt *preparent, + struct iatt *postparent) { int ret = -1; ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); if (ret == 0) { - STACK_UNWIND (frame, op_ret, op_errno); + STACK_UNWIND (frame, op_ret, op_errno, preparent, postparent); } return 0; } @@ -1100,6 +1077,11 @@ ha_unlink (call_frame_t *frame, } local = frame->local; local->stub = fop_unlink_stub (frame, ha_unlink, loc); + if (!local->stub) { + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + op_errno = ENOMEM; + goto err; + } STACK_WIND_COOKIE (frame, ha_unlink_cbk, @@ -1109,7 +1091,7 @@ ha_unlink (call_frame_t *frame, loc); return 0; err: - STACK_UNWIND (frame, -1, op_errno); + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); return 0; } @@ -1118,7 +1100,9 @@ ha_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno) + int32_t op_errno, + struct iatt *preparent, + struct iatt *postparent) { int ret = -1; @@ -1127,7 +1111,9 @@ ha_rmdir_cbk (call_frame_t *frame, if (ret == 0) { STACK_UNWIND (frame, op_ret, - op_errno); + op_errno, + preparent, + postparent); } return 0; } @@ -1147,6 +1133,11 @@ ha_rmdir (call_frame_t *frame, } local = frame->local; local->stub = fop_rmdir_stub (frame, ha_rmdir, loc); + if (!local->stub) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } STACK_WIND_COOKIE (frame, ha_rmdir_cbk, @@ -1156,7 +1147,7 @@ ha_rmdir (call_frame_t *frame, loc); return 0; err: - STACK_UNWIND (frame, -1, op_errno); + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); return 0; } @@ -1168,8 +1159,9 @@ ha_symlink_lookup_cbk (call_frame_t *frame, int32_t op_ret, int32_t op_errno, inode_t *inode, - struct stat *buf, - dict_t *dict) + struct iatt *buf, + dict_t *dict, + struct iatt *postparent) { ha_local_t *local = NULL; ha_private_t *pvt = NULL; @@ -1205,12 +1197,12 @@ ha_symlink_lookup_cbk (call_frame_t *frame, if (cnt == 0) { call_stub_t *stub = local->stub; - FREE (local->state); + GF_FREE (local->state); STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->stub->args.symlink.loc.inode, - &local->buf); + local->stub->args.symlink.loc.inode, &local->buf, + &local->preparent, &local->postparent); call_stub_destroy (stub); } return 0; @@ -1223,7 +1215,9 @@ ha_symlink_cbk (call_frame_t *frame, int32_t op_ret, int32_t op_errno, inode_t *inode, - struct stat *buf) + struct iatt *buf, + struct iatt *preparent, + struct iatt *postparent) { ha_local_t *local = NULL; ha_private_t *pvt = NULL; @@ -1256,6 +1250,8 @@ ha_symlink_cbk (call_frame_t *frame, local->op_ret = 0; local->first_success = 1; local->buf = *buf; + local->preparent = *preparent; + local->postparent = *postparent; } cnt = --local->call_count; for (i = local->active + 1; i < child_count; i++) { @@ -1265,10 +1261,11 @@ ha_symlink_cbk (call_frame_t *frame, if (cnt == 0 || i == child_count) { call_stub_t *stub = local->stub; - FREE (local->state); + GF_FREE (local->state); stub = local->stub; STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->stub->args.symlink.loc.inode, &local->buf); + local->stub->args.symlink.loc.inode, &local->buf, + &local->preparent, &local->postparent); call_stub_destroy (stub); return 0; } @@ -1311,20 +1308,46 @@ ha_symlink (call_frame_t *frame, ha_private_t *pvt = NULL; int child_count = 0, i = 0; char *stateino = NULL; + int32_t op_errno = EINVAL; local = frame->local; pvt = this->private; child_count = pvt->child_count; - frame->local = local = CALLOC (1, sizeof (*local)); + frame->local = local = GF_CALLOC (1, sizeof (*local), + gf_ha_mt_ha_local_t); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + local->stub = fop_symlink_stub (frame, ha_symlink, linkname, loc); + if (!local->stub) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + local->op_ret = -1; local->op_errno = ENOTCONN; - local->state = CALLOC (1, child_count); + local->state = GF_CALLOC (1, child_count, gf_ha_mt_char); + if (!local->state) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + memcpy (local->state, pvt->state, child_count); local->active = -1; - stateino = CALLOC (1, child_count); + stateino = GF_CALLOC (1, child_count, gf_ha_mt_char); + if (!stateino) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino); for (i = 0; i < child_count; i++) { @@ -1342,6 +1365,12 @@ ha_symlink (call_frame_t *frame, HA_ACTIVE_CHILD(this, local)->fops->symlink, linkname, loc); return 0; +err: + local = frame->local; + frame->local = NULL; + STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL, NULL); + ha_local_wipe (local); + return 0; } int32_t @@ -1350,14 +1379,19 @@ ha_rename_cbk (call_frame_t *frame, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct stat *buf) + struct iatt *buf, + struct iatt *preoldparent, + struct iatt *postoldparent, + struct iatt *prenewparent, + struct iatt *postnewparent) { int ret = -1; ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); if (ret == 0) { - STACK_UNWIND (frame, op_ret, op_errno, buf); + STACK_UNWIND (frame, op_ret, op_errno, buf, preoldparent, + postoldparent, prenewparent, postnewparent); } return 0; } @@ -1378,6 +1412,12 @@ ha_rename (call_frame_t *frame, } local = frame->local; local->stub = fop_rename_stub (frame, ha_rename, oldloc, newloc); + if (!local->stub) { + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + op_errno = ENOMEM; + goto err; + } + STACK_WIND_COOKIE (frame, ha_rename_cbk, (void *)(long)local->active, @@ -1386,7 +1426,7 @@ ha_rename (call_frame_t *frame, oldloc, newloc); return 0; err: - STACK_UNWIND (frame, -1, op_errno, NULL); + STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); return 0; } @@ -1397,8 +1437,9 @@ ha_link_lookup_cbk (call_frame_t *frame, int32_t op_ret, int32_t op_errno, inode_t *inode, - struct stat *buf, - dict_t *dict) + struct iatt *buf, + dict_t *dict, + struct iatt *postparent) { ha_local_t *local = NULL; ha_private_t *pvt = NULL; @@ -1434,12 +1475,12 @@ ha_link_lookup_cbk (call_frame_t *frame, if (cnt == 0) { call_stub_t *stub = local->stub; - FREE (local->state); + GF_FREE (local->state); STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->stub->args.link.oldloc.inode, - &local->buf); + local->stub->args.link.oldloc.inode, &local->buf, + &local->preparent, &local->postparent); call_stub_destroy (stub); } return 0; @@ -1452,7 +1493,9 @@ ha_link_cbk (call_frame_t *frame, int32_t op_ret, int32_t op_errno, inode_t *inode, - struct stat *buf) + struct iatt *buf, + struct iatt *preparent, + struct iatt *postparent) { ha_local_t *local = NULL; ha_private_t *pvt = NULL; @@ -1485,6 +1528,8 @@ ha_link_cbk (call_frame_t *frame, local->op_ret = 0; local->first_success = 1; local->buf = *buf; + local->preparent = *preparent; + local->postparent = *postparent; } cnt = --local->call_count; for (i = local->active + 1; i < child_count; i++) { @@ -1494,9 +1539,11 @@ ha_link_cbk (call_frame_t *frame, if (cnt == 0 || i == child_count) { call_stub_t *stub = local->stub; - FREE (local->state); + GF_FREE (local->state); stub = local->stub; - STACK_UNWIND (frame, local->op_ret, local->op_errno, local->stub->args.link.oldloc.inode, &local->buf); + STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->stub->args.link.oldloc.inode, &local->buf, + &local->preparent, &local->postparent); call_stub_destroy (stub); return 0; } @@ -1539,7 +1586,7 @@ ha_link (call_frame_t *frame, ha_private_t *pvt = NULL; int child_count = 0, i = 0; char *stateino = NULL; - int32_t ret = 0; + int32_t ret = 0, op_errno = 0; uint64_t tmp_stateino = 0; ret = inode_ctx_get (newloc->inode, this, &tmp_stateino); @@ -1551,7 +1598,8 @@ ha_link (call_frame_t *frame, if (stateino == NULL) { gf_log (this->name, GF_LOG_ERROR, "newloc->inode's ctx is NULL, returning EINVAL"); - STACK_UNWIND (frame, -1, EINVAL, oldloc->inode, NULL); + STACK_UNWIND (frame, -1, EINVAL, oldloc->inode, NULL, NULL, + NULL); return 0; } @@ -1559,11 +1607,30 @@ ha_link (call_frame_t *frame, pvt = this->private; child_count = pvt->child_count; - frame->local = local = CALLOC (1, sizeof (*local)); + frame->local = local = GF_CALLOC (1, sizeof (*local), + gf_ha_mt_ha_local_t); + if (!frame->local) { + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + op_errno = ENOMEM; + goto err; + } + local->stub = fop_link_stub (frame, ha_link, oldloc, newloc); + if (!local->stub) { + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + op_errno = ENOMEM; + goto err; + } + local->op_ret = -1; local->op_errno = ENOTCONN; - local->state = CALLOC (1, child_count); + local->state = GF_CALLOC (1, child_count, gf_ha_mt_char); + if (!local->state) { + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + op_errno = ENOMEM; + goto err; + } + memcpy (local->state, pvt->state, child_count); local->active = -1; @@ -1582,6 +1649,11 @@ ha_link (call_frame_t *frame, oldloc, newloc); return 0; +err: + local = frame->local; + frame->local = NULL; + STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL, NULL); + return 0; } int32_t @@ -1592,7 +1664,9 @@ ha_create_cbk (call_frame_t *frame, int32_t op_errno, fd_t *fd, inode_t *inode, - struct stat *buf) + struct iatt *buf, + struct iatt *preparent, + struct iatt *postparent) { ha_local_t *local = NULL; ha_private_t *pvt = NULL; @@ -1640,6 +1714,8 @@ ha_create_cbk (call_frame_t *frame, if (local->op_ret == -1) { local->op_ret = 0; local->buf = *buf; + local->preparent = *preparent; + local->postparent = *postparent; local->first_success = 1; } local->stub->args.create.flags &= (~O_EXCL); @@ -1658,8 +1734,9 @@ ha_create_cbk (call_frame_t *frame, call_stub_t *stub = local->stub; STACK_UNWIND (frame, local->op_ret, local->op_errno, stub->args.create.fd, - stub->args.create.loc.inode, &local->buf); - FREE (state); + stub->args.create.loc.inode, &local->buf, + &local->preparent, &local->postparent); + GF_FREE (state); call_stub_destroy (stub); return 0; } @@ -1695,6 +1772,7 @@ ha_create (call_frame_t *frame, char *stateino = NULL; xlator_t **children = NULL; hafd_t *hafdp = NULL; + int32_t op_errno = EINVAL; local = frame->local; pvt = this->private; @@ -1702,9 +1780,28 @@ ha_create (call_frame_t *frame, children = pvt->children; if (local == NULL) { - local = frame->local = CALLOC (1, sizeof (*local)); + frame->local = local = GF_CALLOC (1, sizeof (*local), + gf_ha_mt_ha_local_t); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + local->stub = fop_create_stub (frame, ha_create, loc, flags, mode, fd); - local->state = CALLOC (1, child_count); + if (!local->stub) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + + local->state = GF_CALLOC (1, child_count, gf_ha_mt_char); + if (!local->state) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + local->active = -1; local->op_ret = -1; local->op_errno = ENOTCONN; @@ -1718,10 +1815,34 @@ ha_create (call_frame_t *frame, } } /* FIXME handle active -1 */ - stateino = CALLOC (1, child_count); - hafdp = CALLOC (1, sizeof (*hafdp)); - hafdp->fdstate = CALLOC (1, child_count); - hafdp->path = strdup(loc->path); + stateino = GF_CALLOC (1, child_count, gf_ha_mt_char); + if (!stateino) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + + hafdp = GF_CALLOC (1, sizeof (*hafdp), gf_ha_mt_hafd_t); + if (!hafdp) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + + hafdp->fdstate = GF_CALLOC (1, child_count, gf_ha_mt_char); + if (!hafdp->fdstate) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + + hafdp->path = gf_strdup(loc->path); + if (!hafdp->path) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + LOCK_INIT (&hafdp->lock); fd_ctx_set (fd, this, (uint64_t)(long)hafdp); inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino); @@ -1733,6 +1854,26 @@ ha_create (call_frame_t *frame, children[local->active]->fops->create, loc, flags, mode, fd); return 0; +err: + local = frame->local; + frame->local = NULL; + STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + ha_local_wipe (local); + + if (stateino) { + GF_FREE (stateino); + stateino = NULL; + } + + if (hafdp) { + GF_FREE (hafdp->fdstate); + + GF_FREE (hafdp->path); + + GF_FREE (hafdp); + } + + return 0; } int32_t @@ -1789,7 +1930,7 @@ int32_t ha_open (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t flags, fd_t *fd) + int32_t flags, fd_t *fd, int wbflags) { ha_local_t *local = NULL; ha_private_t *pvt = NULL; @@ -1798,6 +1939,7 @@ ha_open (call_frame_t *frame, int cnt = 0, i, child_count = 0, ret = 0; hafd_t *hafdp = NULL; uint64_t tmp_stateino = 0; + int32_t op_errno = ENOMEM; local = frame->local; pvt = this->private; @@ -1805,14 +1947,39 @@ ha_open (call_frame_t *frame, child_count = pvt->child_count; - local = frame->local = CALLOC (1, sizeof (*local)); + frame->local = local = GF_CALLOC (1, sizeof (*local), + gf_ha_mt_ha_local_t); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + local->op_ret = -1; local->op_errno = ENOTCONN; local->fd = fd; - hafdp = CALLOC (1, sizeof (*hafdp)); - hafdp->fdstate = CALLOC (1, child_count); - hafdp->path = strdup (loc->path); + hafdp = GF_CALLOC (1, sizeof (*hafdp), gf_ha_mt_hafd_t); + if (!hafdp) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + + hafdp->fdstate = GF_CALLOC (1, child_count, gf_ha_mt_char); + if (!hafdp->fdstate) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + + hafdp->path = gf_strdup (loc->path); + if (!hafdp->path) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + hafdp->active = -1; if (pvt->pref_subvol == -1) { hafdp->active = fd->inode->ino % child_count; @@ -1833,12 +2000,33 @@ ha_open (call_frame_t *frame, ha_open_cbk, children[i], children[i]->fops->open, - loc, flags, fd); + loc, flags, fd, wbflags); if (--cnt == 0) break; } } return 0; +err: + local = frame->local; + frame->local = NULL; + + STACK_UNWIND (frame, -1, op_errno, fd); + if (hafdp) { + if (hafdp->fdstate) { + GF_FREE (hafdp->fdstate); + hafdp->fdstate = NULL; + } + + if (hafdp->path) { + GF_FREE (hafdp->path); + hafdp->path = NULL; + } + + GF_FREE (hafdp); + } + + ha_local_wipe (local); + return 0; } int32_t @@ -1849,7 +2037,7 @@ ha_readv_cbk (call_frame_t *frame, int32_t op_errno, struct iovec *vector, int32_t count, - struct stat *stbuf, + struct iatt *stbuf, struct iobref *iobref) { int ret = 0; @@ -1885,6 +2073,11 @@ ha_readv (call_frame_t *frame, } local = frame->local; local->stub = fop_readv_stub (frame, ha_readv, fd, size, offset); + if (!local->stub) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } STACK_WIND_COOKIE (frame, ha_readv_cbk, @@ -1896,7 +2089,12 @@ ha_readv (call_frame_t *frame, offset); return 0; err: + local = frame->local; + frame->local = NULL; + STACK_UNWIND (frame, -1, op_errno, NULL, 0, NULL, NULL); + + ha_local_wipe (local); return 0; } @@ -1906,7 +2104,8 @@ ha_writev_cbk (call_frame_t *frame, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct stat *stbuf) + struct iatt *prebuf, + struct iatt *postbuf) { int ret = 0; ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); @@ -1915,7 +2114,8 @@ ha_writev_cbk (call_frame_t *frame, STACK_UNWIND (frame, op_ret, op_errno, - stbuf); + prebuf, + postbuf); } return 0; } @@ -1938,7 +2138,13 @@ ha_writev (call_frame_t *frame, goto err; } local = frame->local; - local->stub = fop_writev_stub (frame, ha_writev, fd, vector, count, off, iobref); + local->stub = fop_writev_stub (frame, ha_writev, fd, vector, count, off, + iobref); + if (!local->stub) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } STACK_WIND_COOKIE (frame, ha_writev_cbk, @@ -1952,7 +2158,12 @@ ha_writev (call_frame_t *frame, iobref); return 0; err: - STACK_UNWIND (frame, -1, op_errno, NULL); + local = frame->local; + frame->local = NULL; + + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + ha_local_wipe (local); return 0; } @@ -1989,6 +2200,12 @@ ha_flush (call_frame_t *frame, } local = frame->local; local->stub = fop_flush_stub (frame, ha_flush, fd); + if (!local->stub) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + STACK_WIND_COOKIE (frame, ha_flush_cbk, (void *)(long)local->active, @@ -1997,7 +2214,12 @@ ha_flush (call_frame_t *frame, fd); return 0; err: + local = frame->local; + frame->local = NULL; + STACK_UNWIND (frame, -1, op_errno); + + ha_local_wipe (local); return 0; } @@ -2007,7 +2229,9 @@ ha_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno) + int32_t op_errno, + struct iatt *prebuf, + struct iatt *postbuf) { int ret = 0; ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); @@ -2036,6 +2260,12 @@ ha_fsync (call_frame_t *frame, } local = frame->local; local->stub = fop_fsync_stub (frame, ha_fsync, fd, flags); + if (!local->stub) { + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + op_errno = ENOMEM; + goto err; + } + STACK_WIND_COOKIE (frame, ha_fsync_cbk, (void *)(long)local->active, @@ -2045,7 +2275,12 @@ ha_fsync (call_frame_t *frame, flags); return 0; err: + local = frame->local; + frame->local = NULL; + STACK_UNWIND (frame, -1, op_errno); + + ha_local_wipe (local); return 0; } @@ -2055,7 +2290,7 @@ ha_fstat_cbk (call_frame_t *frame, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct stat *buf) + struct iatt *buf) { int ret = 0; @@ -2086,6 +2321,12 @@ ha_fstat (call_frame_t *frame, } local = frame->local; local->stub = fop_fstat_stub (frame, ha_fstat, fd); + if (!local->stub) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + STACK_WIND_COOKIE (frame, ha_fstat_cbk, (void *)(long)local->active, @@ -2094,7 +2335,12 @@ ha_fstat (call_frame_t *frame, fd); return 0; err: + local = frame->local; + frame->local = NULL; + STACK_UNWIND (frame, -1, op_errno, NULL); + + ha_local_wipe (local); return 0; } @@ -2160,20 +2406,46 @@ ha_opendir (call_frame_t *frame, int cnt = 0, i, child_count = 0, ret = 0; hafd_t *hafdp = NULL; uint64_t tmp_stateino = 0; + int32_t op_errno = EINVAL; local = frame->local; pvt = this->private; children = pvt->children; child_count = pvt->child_count; - local = frame->local = CALLOC (1, sizeof (*local)); + frame->local = local = GF_CALLOC (1, sizeof (*local), + gf_ha_mt_ha_local_t); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + local->op_ret = -1; local->op_errno = ENOTCONN; local->fd = fd; - hafdp = CALLOC (1, sizeof (*hafdp)); - hafdp->fdstate = CALLOC (1, child_count); - hafdp->path = strdup (loc->path); + hafdp = GF_CALLOC (1, sizeof (*hafdp), gf_ha_mt_hafd_t); + if (!hafdp) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + + hafdp->fdstate = GF_CALLOC (1, child_count, gf_ha_mt_char); + if (!hafdp->fdstate) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + + hafdp->path = gf_strdup (loc->path); + if (!hafdp->path) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + LOCK_INIT (&hafdp->lock); fd_ctx_set (fd, this, (uint64_t)(long)hafdp); ret = inode_ctx_get (loc->inode, this, &tmp_stateino); @@ -2198,6 +2470,26 @@ ha_opendir (call_frame_t *frame, } } return 0; +err: + local = frame->local; + frame->local = NULL; + + STACK_UNWIND (frame, -1, op_errno, NULL); + ha_local_wipe (local); + if (hafdp) { + if (hafdp->fdstate) { + GF_FREE (hafdp->fdstate); + hafdp->fdstate = NULL; + } + + if (hafdp->path) { + GF_FREE (hafdp->path); + hafdp->path = NULL; + } + + GF_FREE (hafdp); + } + return 0; } int32_t @@ -2239,7 +2531,14 @@ ha_getdents (call_frame_t *frame, goto err; } local = frame->local; - local->stub = fop_getdents_stub (frame, ha_getdents, fd, size, offset, flag); + local->stub = fop_getdents_stub (frame, ha_getdents, fd, size, offset, + flag); + if (!local->stub) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + STACK_WIND_COOKIE (frame, ha_getdents_cbk, (void *)(long)local->active, @@ -2251,7 +2550,12 @@ ha_getdents (call_frame_t *frame, flag); return 0; err: + local = frame->local; + frame->local = NULL; + STACK_UNWIND (frame, -1, op_errno, NULL, 0); + + ha_local_wipe (local); return 0; } @@ -2292,7 +2596,13 @@ ha_setdents (call_frame_t *frame, } local = frame->local; - local->stub = fop_setdents_stub (frame, ha_setdents, fd, flags, entries, count); + local->stub = fop_setdents_stub (frame, ha_setdents, fd, flags, entries, + count); + if (!local->stub) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } STACK_WIND_COOKIE (frame, ha_setdents_cbk, @@ -2305,7 +2615,12 @@ ha_setdents (call_frame_t *frame, count); return 0; err: + local = frame->local; + frame->local = NULL; + STACK_UNWIND (frame, -1, op_errno); + + ha_local_wipe (local); return 0; } @@ -2343,6 +2658,12 @@ ha_fsyncdir (call_frame_t *frame, } local = frame->local; local->stub = fop_fsyncdir_stub (frame, ha_fsyncdir, fd, flags); + if (!local->stub) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + STACK_WIND_COOKIE (frame, ha_fsyncdir_cbk, (void *)(long)local->active, @@ -2352,7 +2673,12 @@ ha_fsyncdir (call_frame_t *frame, flags); return 0; err: - STACK_UNWIND (frame, -1, op_errno); + local = frame->local; + frame->local = NULL; + + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + ha_local_wipe (local); return 0; } @@ -2401,7 +2727,8 @@ ha_statfs (call_frame_t *frame, /* The normal way of handling failover doesn't work here * as loc->inode may be null in this case. */ - local = CALLOC (1, sizeof (*local)); + local = GF_CALLOC (1, sizeof (*local), + gf_ha_mt_ha_local_t); if (!local) { op_errno = ENOMEM; goto err; @@ -2458,6 +2785,12 @@ ha_setxattr (call_frame_t *frame, } local = frame->local; local->stub = fop_setxattr_stub (frame, ha_setxattr, loc, dict, flags); + if (!local->stub) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + STACK_WIND_COOKIE (frame, ha_setxattr_cbk, (void *)(long)local->active, @@ -2468,7 +2801,12 @@ ha_setxattr (call_frame_t *frame, flags); return 0; err: + local = frame->local; + frame->local = NULL; + STACK_UNWIND (frame, -1, op_errno); + + ha_local_wipe (local); return 0; } @@ -2509,6 +2847,12 @@ ha_getxattr (call_frame_t *frame, } local = frame->local; local->stub = fop_getxattr_stub (frame, ha_getxattr, loc, name); + if (!local->stub) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + STACK_WIND_COOKIE (frame, ha_getxattr_cbk, (void *)(long)local->active, @@ -2518,7 +2862,12 @@ ha_getxattr (call_frame_t *frame, name); return 0; err: + local = frame->local; + frame->local = NULL; + STACK_UNWIND (frame, -1, op_errno, NULL); + + ha_local_wipe (local); return 0; } @@ -2557,6 +2906,11 @@ ha_xattrop (call_frame_t *frame, local = frame->local; local->stub = fop_xattrop_stub (frame, ha_xattrop, loc, flags, dict); + if (!local->stub) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } STACK_WIND_COOKIE (frame, ha_xattrop_cbk, @@ -2568,7 +2922,12 @@ ha_xattrop (call_frame_t *frame, dict); return 0; err: + local = frame->local; + frame->local = NULL; + STACK_UNWIND (frame, -1, op_errno, dict); + + ha_local_wipe (local); return 0; } @@ -2604,6 +2963,11 @@ ha_fxattrop (call_frame_t *frame, } local = frame->local; local->stub = fop_fxattrop_stub (frame, ha_fxattrop, fd, flags, dict); + if (!local->stub) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } STACK_WIND_COOKIE (frame, ha_fxattrop_cbk, @@ -2615,7 +2979,12 @@ ha_fxattrop (call_frame_t *frame, dict); return 0; err: + local = frame->local; + frame->local = NULL; + STACK_UNWIND (frame, -1, op_errno, dict); + + ha_local_wipe (local); return 0; } @@ -2653,6 +3022,11 @@ ha_removexattr (call_frame_t *frame, local = frame->local; local->stub = fop_removexattr_stub (frame, ha_removexattr, loc, name); + if (!local->stub) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } STACK_WIND_COOKIE (frame, ha_removexattr_cbk, @@ -2663,7 +3037,12 @@ ha_removexattr (call_frame_t *frame, name); return 0; err: + local = frame->local; + frame->local = NULL; + STACK_UNWIND (frame, -1, op_errno); + + ha_local_wipe (local); return 0; } @@ -2673,7 +3052,7 @@ ha_lk_setlk_unlck_cbk (call_frame_t *frame, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct flock *lock) + struct gf_flock *lock) { ha_local_t *local = NULL; int cnt = 0; @@ -2689,7 +3068,7 @@ ha_lk_setlk_unlck_cbk (call_frame_t *frame, if (cnt == 0) { stub = local->stub; - FREE (local->state); + GF_FREE (local->state); if (stub->args.lk.lock.l_type == F_UNLCK) { STACK_UNWIND (frame, local->op_ret, local->op_errno, &stub->args.lk.lock); } else { @@ -2706,7 +3085,7 @@ ha_lk_setlk_cbk (call_frame_t *frame, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct flock *lock) + struct gf_flock *lock) { ha_local_t *local = NULL; ha_private_t *pvt = NULL; @@ -2738,7 +3117,7 @@ ha_lk_setlk_cbk (call_frame_t *frame, } if (i == child_count) { call_stub_t *stub = local->stub; - FREE (local->state); + GF_FREE (local->state); STACK_UNWIND (frame, 0, op_errno, &stub->args.lk.lock); call_stub_destroy (stub); return 0; @@ -2762,7 +3141,7 @@ ha_lk_setlk_cbk (call_frame_t *frame, cnt++; } if (cnt) { - struct flock lock; + struct gf_flock lock; lock = local->stub->args.lk.lock; for (i = 0; i < child_count; i++) { if (state[i]) { @@ -2779,7 +3158,7 @@ ha_lk_setlk_cbk (call_frame_t *frame, } return 0; } else { - FREE (local->state); + GF_FREE (local->state); call_stub_destroy (local->stub); STACK_UNWIND (frame, op_ret, @@ -2796,7 +3175,7 @@ ha_lk_getlk_cbk (call_frame_t *frame, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct flock *lock) + struct gf_flock *lock) { ha_local_t *local = NULL; ha_private_t *pvt = NULL; @@ -2813,7 +3192,7 @@ ha_lk_getlk_cbk (call_frame_t *frame, prev_frame = cookie; if (op_ret == 0) { - FREE (local->state); + GF_FREE (local->state); call_stub_destroy (local->stub); STACK_UNWIND (frame, 0, 0, lock); return 0; @@ -2830,7 +3209,7 @@ ha_lk_getlk_cbk (call_frame_t *frame, } if (i == child_count) { - FREE (local->state); + GF_FREE (local->state); call_stub_destroy (local->stub); STACK_UNWIND (frame, op_ret, op_errno, lock); return 0; @@ -2851,7 +3230,7 @@ ha_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, - struct flock *lock) + struct gf_flock *lock) { ha_local_t *local = NULL; ha_private_t *pvt = NULL; @@ -2860,6 +3239,7 @@ ha_lk (call_frame_t *frame, int child_count = 0, i = 0, cnt = 0, ret = 0; xlator_t **children = NULL; uint64_t tmp_hafdp = 0; + int32_t op_errno = EINVAL; local = frame->local; pvt = this->private; @@ -2870,7 +3250,14 @@ ha_lk (call_frame_t *frame, gf_log (this->name, GF_LOG_ERROR, "fd_ctx_get failed"); if (local == NULL) { - local = frame->local = CALLOC (1, sizeof (*local)); + local = frame->local = GF_CALLOC (1, sizeof (*local), + gf_ha_mt_ha_local_t); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + local->active = -1; local->op_ret = -1; local->op_errno = ENOTCONN; @@ -2878,12 +3265,24 @@ ha_lk (call_frame_t *frame, hafdp = (hafd_t *)(long)tmp_hafdp; if (local->active == -1) { - STACK_UNWIND (frame, -1, ENOTCONN, NULL); - return 0; + op_errno = ENOTCONN; + goto err; } local->stub = fop_lk_stub (frame, ha_lk, fd, cmd, lock); - local->state = CALLOC (1, child_count); + if (!local->stub) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + + local->state = GF_CALLOC (1, child_count, gf_ha_mt_char); + if (!local->state) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + state = hafdp->fdstate; LOCK (&hafdp->lock); memcpy (local->state, state, child_count); @@ -2931,6 +3330,14 @@ ha_lk (call_frame_t *frame, lock); } return 0; +err: + local = frame->local; + frame->local = NULL; + + STACK_UNWIND (frame, -1, op_errno, NULL); + + ha_local_wipe (local); + return 0; } int32_t @@ -2957,7 +3364,7 @@ ha_inodelk (call_frame_t *frame, const char *volume, loc_t *loc, int32_t cmd, - struct flock *lock) + struct gf_flock *lock) { ha_local_t *local = NULL; int op_errno = 0; @@ -3055,6 +3462,11 @@ ha_checksum (call_frame_t *frame, } local = frame->local; local->stub = fop_checksum_stub (frame, ha_checksum, loc, flag); + if (!local->stub) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } STACK_WIND_COOKIE (frame, ha_checksum_cbk, @@ -3065,17 +3477,18 @@ ha_checksum (call_frame_t *frame, flag); return 0; err: + local = frame->local; + frame->local = NULL; + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + ha_local_wipe (local); return 0; } int32_t -ha_readdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - gf_dirent_t *entries) +ha_common_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries) { int ret = 0; @@ -3086,11 +3499,15 @@ ha_readdir_cbk (call_frame_t *frame, } int32_t -ha_readdir (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t off) +ha_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off); + +int32_t +ha_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off); +int32_t +ha_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, int whichop) { ha_local_t *local = NULL; int op_errno = 0; @@ -3101,19 +3518,58 @@ ha_readdir (call_frame_t *frame, goto err; } local = frame->local; - local->stub = fop_readdir_stub (frame, ha_readdir, fd, size, off); - STACK_WIND_COOKIE (frame, - ha_readdir_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->readdir, - fd, size, off); + if (whichop == GF_FOP_READDIR) + local->stub = fop_readdir_stub (frame, ha_readdir, fd, size, + off); + else + local->stub = fop_readdirp_stub (frame, ha_readdirp, fd, size, + off); + if (!local->stub) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } + + if (whichop == GF_FOP_READDIR) + STACK_WIND_COOKIE (frame, ha_common_readdir_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->readdir, + fd, size, off); + else + STACK_WIND_COOKIE (frame, ha_common_readdir_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->readdirp, + fd, size, off); return 0; err: + local = frame->local; + frame->local = NULL; + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + + ha_local_wipe (local); return 0; } + +int32_t +ha_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off) +{ + ha_do_readdir (frame, this, fd, size, off, GF_FOP_READDIR); + return 0; +} + +int32_t +ha_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off) +{ + ha_do_readdir (frame, this, fd, size, off, GF_FOP_READDIRP); + return 0; +} + /* Management operations */ int32_t @@ -3174,8 +3630,16 @@ ha_stats (call_frame_t *frame, ha_private_t *pvt = NULL; xlator_t **children = NULL; int i = 0; + int32_t op_errno = EINVAL; + + local = frame->local = GF_CALLOC (1, sizeof (*local), + gf_ha_mt_ha_local_t); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } - local = frame->local = CALLOC (1, sizeof (*local)); pvt = this->private; children = pvt->children; for (i = 0; i < pvt->child_count; i++) { @@ -3184,8 +3648,8 @@ ha_stats (call_frame_t *frame, } if (i == pvt->child_count) { - STACK_UNWIND (frame, -1, ENOTCONN, NULL); - return 0; + op_errno = ENOTCONN; + goto err; } local->flags = flags; @@ -3195,6 +3659,16 @@ ha_stats (call_frame_t *frame, children[i]->mops->stats, flags); return 0; + +err: + local = frame->local; + frame->local = NULL; + + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + + ha_local_wipe (local); + return 0; + } @@ -3258,20 +3732,27 @@ ha_getspec (call_frame_t *frame, ha_private_t *pvt = NULL; xlator_t **children = NULL; int i = 0; + int32_t op_errno = EINVAL; + + local = frame->local = GF_CALLOC (1, sizeof (*local), + gf_ha_mt_ha_local_t); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto err; + } - local = frame->local = CALLOC (1, sizeof (*local)); pvt = this->private; children = pvt->children; - local = frame->local = CALLOC (1, sizeof (*local)); for (i = 0; i < pvt->child_count; i++) { if (pvt->state[i]) break; } if (i == pvt->child_count) { - STACK_UNWIND (frame, -1, ENOTCONN, NULL); - return 0; + op_errno = ENOTCONN; + goto err; } local->flags = flags; local->pattern = (char *)key; @@ -3282,6 +3763,15 @@ ha_getspec (call_frame_t *frame, children[i]->mops->getspec, key, flags); return 0; +err: + local = frame->local; + frame->local = NULL; + + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + + ha_local_wipe (local); + return 0; + } int32_t @@ -3299,8 +3789,8 @@ ha_closedir (xlator_t *this, } hafdp = (hafd_t *)(long)tmp_hafdp; - FREE (hafdp->fdstate); - FREE (hafdp->path); + GF_FREE (hafdp->fdstate); + GF_FREE (hafdp->path); LOCK_DESTROY (&hafdp->lock); return 0; } @@ -3320,8 +3810,8 @@ ha_close (xlator_t *this, } hafdp = (hafd_t *)(long)tmp_hafdp; - FREE (hafdp->fdstate); - FREE (hafdp->path); + GF_FREE (hafdp->fdstate); + GF_FREE (hafdp->path); LOCK_DESTROY (&hafdp->lock); return 0; } @@ -3392,6 +3882,25 @@ notify (xlator_t *this, return 0; } +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init (this, gf_ha_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); + return ret; + } + + return ret; +} + int init (xlator_t *this) { @@ -3399,6 +3908,7 @@ init (xlator_t *this) xlator_list_t *trav = NULL; int count = 0, ret = 0; + if (!this->children) { gf_log (this->name,GF_LOG_ERROR, "FATAL: ha should have one or more child defined"); @@ -3411,7 +3921,7 @@ init (xlator_t *this) } trav = this->children; - pvt = CALLOC (1, sizeof (ha_private_t)); + pvt = GF_CALLOC (1, sizeof (ha_private_t), gf_ha_mt_ha_private_t); ret = dict_get_int32 (this->options, "preferred-subvolume", &pvt->pref_subvol); @@ -3426,7 +3936,8 @@ init (xlator_t *this) } pvt->child_count = count; - pvt->children = CALLOC (count, sizeof (xlator_t*)); + pvt->children = GF_CALLOC (count, sizeof (xlator_t*), + gf_ha_mt_xlator_t); trav = this->children; count = 0; @@ -3436,7 +3947,7 @@ init (xlator_t *this) trav = trav->next; } - pvt->state = CALLOC (1, count); + pvt->state = GF_CALLOC (1, count, gf_ha_mt_char); this->private = pvt; return 0; } @@ -3446,7 +3957,7 @@ fini (xlator_t *this) { ha_private_t *priv = NULL; priv = this->private; - FREE (priv); + GF_FREE (priv); return; } @@ -3462,10 +3973,7 @@ struct xlator_fops fops = { .symlink = ha_symlink, .rename = ha_rename, .link = ha_link, - .chmod = ha_chmod, - .chown = ha_chown, .truncate = ha_truncate, - .utimens = ha_utimens, .create = ha_create, .open = ha_open, .readv = ha_readv, @@ -3478,24 +3986,20 @@ struct xlator_fops fops = { .removexattr = ha_removexattr, .opendir = ha_opendir, .readdir = ha_readdir, + .readdirp = ha_readdirp, .getdents = ha_getdents, .fsyncdir = ha_fsyncdir, .access = ha_access, .ftruncate = ha_ftruncate, .fstat = ha_fstat, .lk = ha_lk, - .fchmod = ha_fchmod, - .fchown = ha_fchown, .setdents = ha_setdents, .lookup_cbk = ha_lookup_cbk, .checksum = ha_checksum, .xattrop = ha_xattrop, - .fxattrop = ha_fxattrop -}; - -struct xlator_mops mops = { - .stats = ha_stats, - .getspec = ha_getspec, + .fxattrop = ha_fxattrop, + .setattr = ha_setattr, + .fsetattr = ha_fsetattr, }; struct xlator_cbks cbks = { diff --git a/xlators/cluster/ha/src/ha.h b/xlators/cluster/ha/src/ha.h index d47d96535..e2ed7eaa6 100644 --- a/xlators/cluster/ha/src/ha.h +++ b/xlators/cluster/ha/src/ha.h @@ -1,25 +1,17 @@ /* - Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ #ifndef __HA_H_ #define __HA_H_ +#include "ha-mem-types.h" + typedef struct { call_stub_t *stub; int32_t op_ret, op_errno; @@ -28,7 +20,9 @@ typedef struct { char *state, *pattern; dict_t *dict; loc_t loc; - struct stat buf; + struct iatt buf; + struct iatt postparent; + struct iatt preparent; fd_t *fd; inode_t *inode; int32_t flags; diff --git a/xlators/cluster/map/src/Makefile.am b/xlators/cluster/map/src/Makefile.am index 26e19137a..a278b05e2 100644 --- a/xlators/cluster/map/src/Makefile.am +++ b/xlators/cluster/map/src/Makefile.am @@ -1,15 +1,16 @@ xlator_LTLIBRARIES = map.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/cluster -map_la_LDFLAGS = -module -avoidversion +map_la_LDFLAGS = -module -avoid-version map_la_SOURCES = map.c map-helper.c map_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la noinst_HEADERS = map.h -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) CLEANFILES = diff --git a/xlators/cluster/map/src/map-helper.c b/xlators/cluster/map/src/map-helper.c index 365eeb490..851397b68 100644 --- a/xlators/cluster/map/src/map-helper.c +++ b/xlators/cluster/map/src/map-helper.c @@ -1,22 +1,12 @@ /* - Copyright (c) 2009-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2009-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" @@ -256,14 +246,15 @@ verify_dir_and_assign_subvol (xlator_t *this, goto out; } - tmp_map = CALLOC (1, sizeof (struct map_pattern)); + tmp_map = GF_CALLOC (1, sizeof (struct map_pattern), + gf_map_mt_map_pattern); tmp_map->xl = trav->xlator; tmp_map->dir_len = strlen (directory); /* make sure that the top level directory starts * with '/' and ends without '/' */ - tmp_map->directory = strdup (directory); + tmp_map->directory = gf_strdup (directory); if (directory[tmp_map->dir_len - 1] == '/') { tmp_map->dir_len--; } diff --git a/xlators/cluster/map/src/map-mem-types.h b/xlators/cluster/map/src/map-mem-types.h new file mode 100644 index 000000000..3e89f4736 --- /dev/null +++ b/xlators/cluster/map/src/map-mem-types.h @@ -0,0 +1,24 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __MAP_MEM_TYPES_H__ +#define __MAP_MEM_TYPES_H__ + +#include "mem-types.h" + +enum gf_map_mem_types_ { + gf_map_mt_map_private_t = gf_common_mt_end + 1, + gf_map_mt_map_local_t, + gf_map_mt_map_xlator_array, + gf_map_mt_map_pattern, + gf_map_mt_end +}; +#endif + diff --git a/xlators/cluster/map/src/map.c b/xlators/cluster/map/src/map.c index 98d6b33b0..6150a33ce 100644 --- a/xlators/cluster/map/src/map.c +++ b/xlators/cluster/map/src/map.c @@ -1,22 +1,12 @@ /* - Copyright (c) 2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" @@ -36,87 +26,53 @@ map_stat_cbk (call_frame_t *frame, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct stat *buf) + struct iatt *buf) { call_frame_t *prev = NULL; prev = cookie; - map_itransform (this, prev->this, buf->st_ino, &buf->st_ino); + map_itransform (this, prev->this, buf->ia_ino, &buf->ia_ino); STACK_UNWIND (frame, op_ret, op_errno, buf); return 0; } - static int32_t -map_chmod_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *buf) +map_setattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iatt *statpre, + struct iatt *statpost) { call_frame_t *prev = NULL; prev = cookie; - - map_itransform (this, prev->this, buf->st_ino, &buf->st_ino); - - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; -} + map_itransform (this, prev->this, statpre->ia_ino, &statpre->ia_ino); + map_itransform (this, prev->this, statpost->ia_ino, &statpost->ia_ino); -static int32_t -map_fchmod_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *buf) -{ - call_frame_t *prev = NULL; - prev = cookie; - - map_itransform (this, prev->this, buf->st_ino, &buf->st_ino); - - STACK_UNWIND (frame, op_ret, op_errno, buf); + STACK_UNWIND (frame, op_ret, op_errno, statpre, statpost); return 0; } - static int32_t -map_chown_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *buf) +map_fsetattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iatt *statpre, + struct iatt *statpost) { call_frame_t *prev = NULL; prev = cookie; - - map_itransform (this, prev->this, buf->st_ino, &buf->st_ino); - - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; -} + map_itransform (this, prev->this, statpre->ia_ino, &statpre->ia_ino); + map_itransform (this, prev->this, statpost->ia_ino, &statpost->ia_ino); -static int32_t -map_fchown_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *buf) -{ - call_frame_t *prev = NULL; - prev = cookie; - - map_itransform (this, prev->this, buf->st_ino, &buf->st_ino); - - STACK_UNWIND (frame, op_ret, op_errno, buf); + STACK_UNWIND (frame, op_ret, op_errno, statpre, statpost); return 0; } @@ -126,14 +82,15 @@ map_truncate_cbk (call_frame_t *frame, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct stat *buf) + struct iatt *prebuf, + struct iatt *postbuf) { call_frame_t *prev = NULL; prev = cookie; - map_itransform (this, prev->this, buf->st_ino, &buf->st_ino); + map_itransform (this, prev->this, postbuf->ia_ino, &postbuf->ia_ino); - STACK_UNWIND (frame, op_ret, op_errno, buf); + STACK_UNWIND (frame, op_ret, op_errno, prebuf, postbuf); return 0; } @@ -143,31 +100,15 @@ map_ftruncate_cbk (call_frame_t *frame, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct stat *buf) -{ - call_frame_t *prev = NULL; - prev = cookie; - - map_itransform (this, prev->this, buf->st_ino, &buf->st_ino); - - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; -} - -int32_t -map_utimens_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *buf) + struct iatt *prebuf, + struct iatt *postbuf) { call_frame_t *prev = NULL; prev = cookie; - map_itransform (this, prev->this, buf->st_ino, &buf->st_ino); + map_itransform (this, prev->this, postbuf->ia_ino, &postbuf->ia_ino); - STACK_UNWIND (frame, op_ret, op_errno, buf); + STACK_UNWIND (frame, op_ret, op_errno, prebuf, postbuf); return 0; } @@ -189,9 +130,10 @@ map_readlink_cbk (call_frame_t *frame, xlator_t *this, int32_t op_ret, int32_t op_errno, - const char *path) + const char *path, + struct iatt *sbuf) { - STACK_UNWIND (frame, op_ret, op_errno, path); + STACK_UNWIND (frame, op_ret, op_errno, path, sbuf); return 0; } @@ -200,9 +142,11 @@ map_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno) + int32_t op_errno, + struct iatt *preparent, + struct iatt *postparent) { - STACK_UNWIND (frame, op_ret, op_errno); + STACK_UNWIND (frame, op_ret, op_errno, preparent, postparent); return 0; } @@ -211,9 +155,11 @@ map_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno) + int32_t op_errno, + struct iatt *preparent, + struct iatt *postparent) { - STACK_UNWIND (frame, op_ret, op_errno); + STACK_UNWIND (frame, op_ret, op_errno, preparent, postparent); return 0; } @@ -224,12 +170,16 @@ map_rename_cbk (call_frame_t *frame, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct stat *buf) + struct iatt *buf, + struct iatt *preoldparent, + struct iatt *postoldparent, + struct iatt *prenewparent, + struct iatt *postnewparent) { call_frame_t *prev = NULL; prev = cookie; - map_itransform (this, prev->this, buf->st_ino, &buf->st_ino); + map_itransform (this, prev->this, buf->ia_ino, &buf->ia_ino); STACK_UNWIND (frame, op_ret, op_errno, buf); return 0; @@ -242,12 +192,14 @@ map_link_cbk (call_frame_t *frame, int32_t op_ret, int32_t op_errno, inode_t *inode, - struct stat *buf) + struct iatt *buf, + struct iatt *preparent, + struct iatt *postparent) { call_frame_t *prev = NULL; prev = cookie; - map_itransform (this, prev->this, buf->st_ino, &buf->st_ino); + map_itransform (this, prev->this, buf->ia_ino, &buf->ia_ino); STACK_UNWIND (frame, op_ret, op_errno, inode, buf); return 0; @@ -273,13 +225,13 @@ map_readv_cbk (call_frame_t *frame, int32_t op_errno, struct iovec *vector, int32_t count, - struct stat *stbuf, + struct iatt *stbuf, struct iobref *iobref) { call_frame_t *prev = NULL; prev = cookie; - map_itransform (this, prev->this, stbuf->st_ino, &stbuf->st_ino); + map_itransform (this, prev->this, stbuf->ia_ino, &stbuf->ia_ino); STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf, iobref); return 0; @@ -291,14 +243,15 @@ map_writev_cbk (call_frame_t *frame, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct stat *stbuf) + struct iatt *prebuf, + struct iatt *postbuf) { call_frame_t *prev = NULL; prev = cookie; - map_itransform (this, prev->this, stbuf->st_ino, &stbuf->st_ino); + map_itransform (this, prev->this, postbuf->ia_ino, &postbuf->ia_ino); - STACK_UNWIND (frame, op_ret, op_errno, stbuf); + STACK_UNWIND (frame, op_ret, op_errno, prebuf, postbuf); return 0; } @@ -319,9 +272,11 @@ map_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno) + int32_t op_errno, + struct iatt *prebuf, + struct iatt *postbuf) { - STACK_UNWIND (frame, op_ret, op_errno); + STACK_UNWIND (frame, op_ret, op_errno, prebuf, postbuf); return 0; } @@ -332,12 +287,12 @@ map_fstat_cbk (call_frame_t *frame, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct stat *buf) + struct iatt *buf) { call_frame_t *prev = NULL; prev = cookie; - map_itransform (this, prev->this, buf->st_ino, &buf->st_ino); + map_itransform (this, prev->this, buf->ia_ino, &buf->ia_ino); STACK_UNWIND (frame, op_ret, op_errno, buf); return 0; @@ -473,7 +428,7 @@ map_lk_cbk (call_frame_t *frame, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct flock *lock) + struct gf_flock *lock) { STACK_UNWIND (frame, op_ret, op_errno, lock); return 0; @@ -527,12 +482,14 @@ map_newentry_cbk (call_frame_t *frame, int32_t op_ret, int32_t op_errno, inode_t *inode, - struct stat *buf) + struct iatt *buf, + struct iatt *preparent, + struct iatt *postparent) { call_frame_t *prev = NULL; prev = cookie; - map_itransform (this, prev->this, buf->st_ino, &buf->st_ino); + map_itransform (this, prev->this, buf->ia_ino, &buf->ia_ino); STACK_UNWIND (frame, op_ret, op_errno, inode, buf); return 0; @@ -548,12 +505,14 @@ map_create_cbk (call_frame_t *frame, int32_t op_errno, fd_t *fd, inode_t *inode, - struct stat *buf) + struct iatt *buf, + struct iatt *preparent, + struct iatt *postparent) { call_frame_t *prev = NULL; prev = cookie; - map_itransform (this, prev->this, buf->st_ino, &buf->st_ino); + map_itransform (this, prev->this, buf->ia_ino, &buf->ia_ino); STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf); return 0; @@ -655,13 +614,14 @@ map_single_lookup_cbk (call_frame_t *frame, int32_t op_ret, int32_t op_errno, inode_t *inode, - struct stat *buf, - dict_t *dict) + struct iatt *buf, + dict_t *dict, + struct iatt *postparent) { call_frame_t *prev = NULL; prev = cookie; - map_itransform (this, prev->this, buf->st_ino, &buf->st_ino); + map_itransform (this, prev->this, buf->ia_ino, &buf->ia_ino); STACK_UNWIND (frame, op_ret, op_errno, inode, buf, dict); @@ -675,8 +635,9 @@ map_root_lookup_cbk (call_frame_t *frame, int32_t op_ret, int32_t op_errno, inode_t *inode, - struct stat *buf, - dict_t *dict) + struct iatt *buf, + dict_t *dict, + struct iatt *postparent) { int callcnt = 0; map_local_t *local = NULL; @@ -779,9 +740,36 @@ map_single_readdir_cbk (call_frame_t *frame, } +int32_t +map_single_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries) +{ + call_frame_t *prev = NULL; + gf_dirent_t *orig_entry = NULL; + + prev = cookie; + + list_for_each_entry (orig_entry, &entries->list, list) { + map_itransform (this, prev->this, orig_entry->d_ino, + &orig_entry->d_ino); + orig_entry->d_stat.ia_ino = orig_entry->d_ino; + } + STACK_UNWIND (frame, op_ret, op_errno, entries); + return 0; +} + int map_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *orig_entries) + int op_ret, int op_errno, gf_dirent_t *orig_entries); + +int +map_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *orig_entries); + +int +map_generic_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *orig_entries, + int whichop) { map_local_t *local = NULL; gf_dirent_t entries; @@ -816,6 +804,8 @@ map_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, map_itransform (this, subvol, orig_entry->d_off, &entry->d_off); + if (whichop == GF_FOP_READDIRP) + entry->d_stat.ia_ino = entry->d_ino; entry->d_type = orig_entry->d_type; entry->d_len = orig_entry->d_len; @@ -841,9 +831,14 @@ done: goto unwind; } - STACK_WIND (frame, map_readdir_cbk, - next_subvol, next_subvol->fops->readdir, - local->fd, local->size, 0); + if (whichop == GF_FOP_READDIR) + STACK_WIND (frame, map_readdir_cbk, next_subvol, + next_subvol->fops->readdir, local->fd, + local->size, 0); + else + STACK_WIND (frame, map_readdirp_cbk, next_subvol, + next_subvol->fops->readdirp, local->fd, + local->size, 0); return 0; } @@ -864,6 +859,26 @@ unwind: } +int +map_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *orig_entries) +{ + map_generic_readdir_cbk (frame, cookie, this, op_ret, op_errno, + orig_entries, GF_FOP_READDIR); + return 0; +} + + +int +map_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *orig_entries) +{ + map_generic_readdir_cbk (frame, cookie, this, op_ret, op_errno, + orig_entries, GF_FOP_READDIRP); + return 0; +} + + /* Management operations */ static int32_t @@ -880,23 +895,6 @@ map_checksum_cbk (call_frame_t *frame, } -int32_t -map_lock_notify_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - - -int32_t -map_lock_fnotify_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - /* Fops starts here */ int32_t @@ -928,21 +926,22 @@ map_stat (call_frame_t *frame, return 0; } - int32_t -map_chmod (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - mode_t mode) +map_setattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + struct iatt *stbuf, + int32_t valid) { int32_t op_errno = 1; xlator_t *subvol = NULL; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); + GF_VALIDATE_OR_GOTO ("map", this, err); + GF_VALIDATE_OR_GOTO (this->name, frame, err); + GF_VALIDATE_OR_GOTO (this->name, loc, err); + GF_VALIDATE_OR_GOTO (this->name, loc->inode, err); + GF_VALIDATE_OR_GOTO (this->name, loc->path, err); + GF_VALIDATE_OR_GOTO (this->name, stbuf, err); subvol = get_mapping_subvol_from_ctx (this, loc->inode); if (!subvol) { @@ -950,8 +949,8 @@ map_chmod (call_frame_t *frame, goto err; } - STACK_WIND (frame, map_chmod_cbk, subvol, - subvol->fops->chmod, loc, mode); + STACK_WIND (frame, map_setattr_cbk, subvol, + subvol->fops->setattr, loc, stbuf, valid); return 0; err: STACK_UNWIND (frame, -1, op_errno, NULL, NULL); @@ -960,18 +959,19 @@ map_chmod (call_frame_t *frame, } int32_t -map_fchmod (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - mode_t mode) +map_fsetattr (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iatt *stbuf, + int32_t valid) { int32_t op_errno = 1; xlator_t *subvol = NULL; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); + GF_VALIDATE_OR_GOTO ("map", this, err); + GF_VALIDATE_OR_GOTO (this->name, frame, err); + GF_VALIDATE_OR_GOTO (this->name, fd, err); + GF_VALIDATE_OR_GOTO (this->name, stbuf, err); subvol = get_mapping_subvol_from_ctx (this, fd->inode); if (!subvol) { @@ -979,71 +979,8 @@ map_fchmod (call_frame_t *frame, goto err; } - STACK_WIND (frame, map_fchmod_cbk, subvol, - subvol->fops->fchmod, fd, mode); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - -int32_t -map_chown (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - uid_t uid, - gid_t gid) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - subvol = get_mapping_subvol_from_ctx (this, loc->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_chown_cbk, subvol, - subvol->fops->chown, loc, uid, gid); - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - -int32_t -map_fchown (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - uid_t uid, - gid_t gid) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - subvol = get_mapping_subvol_from_ctx (this, fd->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_fchown_cbk, subvol, - subvol->fops->fchown, fd, uid, gid); - + STACK_WIND (frame, map_fsetattr_cbk, subvol, + subvol->fops->fsetattr, fd, stbuf, valid); return 0; err: STACK_UNWIND (frame, -1, op_errno, NULL, NULL); @@ -1113,37 +1050,6 @@ map_ftruncate (call_frame_t *frame, } int32_t -map_utimens (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - struct timespec tv[2]) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - subvol = get_mapping_subvol_from_ctx (this, loc->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_utimens_cbk, subvol, - subvol->fops->utimens, loc, tv); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - -int32_t map_access (call_frame_t *frame, xlator_t *this, loc_t *loc, @@ -1353,7 +1259,7 @@ int32_t map_open (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t flags, fd_t *fd) + int32_t flags, fd_t *fd, int wbflags) { int32_t op_errno = 1; xlator_t *subvol = NULL; @@ -1371,7 +1277,7 @@ map_open (call_frame_t *frame, } STACK_WIND (frame, map_open_cbk, subvol, - subvol->fops->open, loc, flags, fd); + subvol->fops->open, loc, flags, fd, wbflags); return 0; err: @@ -1854,7 +1760,7 @@ map_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, - struct flock *lock) + struct gf_flock *lock) { int32_t op_errno = 1; xlator_t *subvol = NULL; @@ -1883,7 +1789,7 @@ map_lk (call_frame_t *frame, int32_t map_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct flock *lock) + const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *lock) { int32_t op_errno = 1; xlator_t *subvol = NULL; @@ -1913,7 +1819,7 @@ map_inodelk (call_frame_t *frame, xlator_t *this, int32_t map_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct flock *lock) + const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *lock) { int32_t op_errno = 1; xlator_t *subvol = NULL; @@ -2231,7 +2137,8 @@ map_lookup (call_frame_t *frame, return 0; root_inode: - local = CALLOC (1, sizeof (map_local_t)); + local = GF_CALLOC (1, sizeof (map_local_t), + gf_map_mt_map_local_t); frame->local = local; local->call_count = priv->child_count; @@ -2283,7 +2190,8 @@ map_statfs (call_frame_t *frame, return 0; root_inode: - local = CALLOC (1, sizeof (map_local_t)); + local = GF_CALLOC (1, sizeof (map_local_t), + gf_map_mt_map_local_t); priv = this->private; frame->local = local; @@ -2335,7 +2243,8 @@ map_opendir (call_frame_t *frame, return 0; root_inode: - local = CALLOC (1, sizeof (map_local_t)); + local = GF_CALLOC (1, sizeof (map_local_t), + gf_map_mt_map_local_t); priv = this->private; frame->local = local; @@ -2359,11 +2268,8 @@ map_opendir (call_frame_t *frame, int32_t -map_readdir (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t yoff) +map_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t yoff, int whichop) { int32_t op_errno = EINVAL; xlator_t *subvol = NULL; @@ -2386,14 +2292,19 @@ map_readdir (call_frame_t *frame, } /* Just one callback */ - STACK_WIND (frame, map_single_readdir_cbk, subvol, - subvol->fops->readdir, fd, size, yoff); + if (whichop == GF_FOP_READDIR) + STACK_WIND (frame, map_single_readdir_cbk, subvol, + subvol->fops->readdir, fd, size, yoff); + else + STACK_WIND (frame, map_single_readdirp_cbk, subvol, + subvol->fops->readdirp, fd, size, yoff); return 0; root_inode: /* readdir on '/' */ - local = CALLOC (1, sizeof (map_local_t)); + local = GF_CALLOC (1, sizeof (map_local_t), + gf_map_mt_map_local_t); if (!local) { gf_log (this->name, GF_LOG_ERROR, "memory allocation failed :("); @@ -2411,8 +2322,12 @@ map_readdir (call_frame_t *frame, map_deitransform (this, yoff, &xvol, (uint64_t *)&xoff); - STACK_WIND (frame, map_readdir_cbk, xvol, - xvol->fops->readdir, fd, size, xoff); + if (whichop == GF_FOP_READDIR) + STACK_WIND (frame, map_readdir_cbk, xvol, xvol->fops->readdir, + fd, size, xoff); + else + STACK_WIND (frame, map_readdirp_cbk, xvol, xvol->fops->readdirp, + fd, size, xoff); return 0; err: @@ -2422,6 +2337,24 @@ map_readdir (call_frame_t *frame, } + +int32_t +map_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t yoff) +{ + map_do_readdir (frame, this, fd, size, yoff, GF_FOP_READDIR); + return 0; +} + +int32_t +map_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t yoff) +{ + map_do_readdir (frame, this, fd, size, yoff, GF_FOP_READDIRP); + return 0; +} + + void fini (xlator_t *this) { @@ -2432,22 +2365,40 @@ fini (xlator_t *this) priv = this->private; if (priv) { - if (priv->xlarray) - FREE (priv->xlarray); + GF_FREE (priv->xlarray); trav_map = priv->map; while (trav_map) { tmp_map = trav_map; trav_map = trav_map->next; - FREE (tmp_map); + GF_FREE (tmp_map); } - FREE(priv); + GF_FREE(priv); } return; } +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init (this, gf_map_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); + return ret; + } + + return ret; +} + int init (xlator_t *this) { @@ -2464,6 +2415,7 @@ init (xlator_t *this) char *subvol_str = NULL; char *map_xl = NULL; + if (!this->children) { gf_log (this->name,GF_LOG_ERROR, "FATAL: map should have one or more child defined"); @@ -2475,7 +2427,8 @@ init (xlator_t *this) "dangling volume. check volfile "); } - priv = CALLOC (1, sizeof (map_private_t)); + priv = GF_CALLOC (1, sizeof (map_private_t), + gf_map_mt_map_private_t); this->private = priv; /* allocate xlator array */ @@ -2484,7 +2437,8 @@ init (xlator_t *this) count++; trav = trav->next; } - priv->xlarray = CALLOC (1, sizeof (struct map_xlator_array) * count); + priv->xlarray = GF_CALLOC (1, sizeof (struct map_xlator_array) * count, + gf_map_mt_map_xlator_array); priv->child_count = count; /* build xlator array */ @@ -2504,7 +2458,7 @@ init (xlator_t *this) } map_pair_str = strtok_r (pattern_string, ";", &tmp_str); while (map_pair_str) { - dup_map_pair = strdup (map_pair_str); + dup_map_pair = gf_strdup (map_pair_str); dir_str = strtok_r (dup_map_pair, ":", &tmp_str1); if (!dir_str) { gf_log (this->name, GF_LOG_ERROR, @@ -2526,7 +2480,7 @@ init (xlator_t *this) goto err; } - FREE (dup_map_pair); + GF_FREE (dup_map_pair); map_pair_str = strtok_r (NULL, ";", &tmp_str); } @@ -2557,12 +2511,7 @@ struct xlator_fops fops = { .create = map_create, .stat = map_stat, - .chmod = map_chmod, - .chown = map_chown, - .fchown = map_fchown, - .fchmod = map_fchmod, .fstat = map_fstat, - .utimens = map_utimens, .truncate = map_truncate, .ftruncate = map_ftruncate, .access = map_access, @@ -2581,6 +2530,7 @@ struct xlator_fops fops = { .lk = map_lk, .opendir = map_opendir, .readdir = map_readdir, + .readdirp = map_readdirp, .fsyncdir = map_fsyncdir, .symlink = map_symlink, .unlink = map_unlink, @@ -2597,9 +2547,8 @@ struct xlator_fops fops = { .setdents = map_setdents, .getdents = map_getdents, .checksum = map_checksum, -}; - -struct xlator_mops mops = { + .setattr = map_setattr, + .fsetattr = map_fsetattr, }; struct xlator_cbks cbks = { diff --git a/xlators/cluster/map/src/map.h b/xlators/cluster/map/src/map.h index 72b4f5640..7703a543e 100644 --- a/xlators/cluster/map/src/map.h +++ b/xlators/cluster/map/src/map.h @@ -1,26 +1,17 @@ /* - Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ - #ifndef __MAP_H__ #define __MAP_H__ #include "xlator.h" +#include "map-mem-types.h" struct map_pattern { struct map_pattern *next; @@ -46,7 +37,7 @@ typedef struct { int32_t op_errno; int call_count; struct statvfs statvfs; - struct stat stbuf; + struct iatt stbuf; inode_t *inode; dict_t *dict; fd_t *fd; diff --git a/xlators/cluster/unify/Makefile.am b/xlators/cluster/nsr-client/Makefile.am index d471a3f92..d471a3f92 100644 --- a/xlators/cluster/unify/Makefile.am +++ b/xlators/cluster/nsr-client/Makefile.am diff --git a/xlators/cluster/nsr-client/src/Makefile.am b/xlators/cluster/nsr-client/src/Makefile.am new file mode 100644 index 000000000..4541ea01a --- /dev/null +++ b/xlators/cluster/nsr-client/src/Makefile.am @@ -0,0 +1,33 @@ +noinst_PYTHON = gen-fops.py + +xlator_LTLIBRARIES = nsrc.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + +nsrc_la_LDFLAGS = -module -avoid-version +nsrc_la_SOURCES = nsrc.c + +nsrc_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = fop-template.c \ + $(top_srcdir)/xlators/lib/src/libxlator.h \ + $(top_srcdir)/glusterfsd/src/glusterfsd.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) \ + -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \ + -I$(top_srcdir)/rpc/rpc-lib/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +XLATOR_HEADER = $(top_srcdir)/libglusterfs/src/xlator.h + +CLEANFILES = nsrc-cg.c + +CODEGEN_DIR = ../../nsr-server/src/codegen.py + +nsrc-cg.c: gen-fops.py $(CODEGEN) $(XLATOR_HEADER) fop-template.c + $(PYTHON) ./gen-fops.py $(XLATOR_HEADER) fop-template.c > $@ + +nsrc.lo: nsrc-cg.c + +uninstall-local: + rm -f $(DESTDIR)$(xlatordir)/nsr.so diff --git a/xlators/cluster/nsr-client/src/fop-template.c b/xlators/cluster/nsr-client/src/fop-template.c new file mode 100644 index 000000000..699b07d40 --- /dev/null +++ b/xlators/cluster/nsr-client/src/fop-template.c @@ -0,0 +1,113 @@ +// template-name fop +$TYPE$ +nsrc_$NAME$ (call_frame_t *frame, xlator_t *this, + $ARGS_LONG$) +{ + nsrc_local_t *local = NULL; + xlator_t *target_xl = ACTIVE_CHILD(this); + + local = mem_get(this->local_pool); + if (!local) { + goto err; + } + + local->stub = fop_$NAME$_stub (frame, nsrc_$NAME$_continue, + $ARGS_SHORT$); + if (!local->stub) { + goto err; + } + local->curr_xl = target_xl; + local->scars = 0; + + frame->local = local; + STACK_WIND_COOKIE (frame, nsrc_$NAME$_cbk, target_xl, + target_xl, target_xl->fops->$NAME$, + $ARGS_SHORT$); + return 0; + +err: + if (local) { + mem_put(local); + } + STACK_UNWIND_STRICT ($NAME$, frame, -1, ENOMEM, + $DEFAULTS$); + return 0; +} + +// template-name cbk +$TYPE$ +nsrc_$NAME$_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + $ARGS_LONG$) +{ + nsrc_local_t *local = frame->local; + xlator_t *last_xl = cookie; + xlator_t *next_xl; + nsrc_private_t *priv = this->private; + struct timespec spec; + + if (op_ret != (-1)) { + if (local->scars) { + gf_log (this->name, GF_LOG_INFO, + HILITE("retried %p OK"), frame->local); + } + priv->active = last_xl; + goto unwind; + } + if ((op_errno != EREMOTE) && (op_errno != ENOTCONN)) { + goto unwind; + } + + /* TBD: get leader ID from xdata? */ + next_xl = next_xlator(this,last_xl); + /* + * We can't just give up after we've tried all bricks, because it's + * quite likely that a new leader election just hasn't finished yet. + * We also shouldn't retry endlessly, and especially not at a high + * rate, but that's good enough while we work on other things. + * + * TBD: implement slow/finite retry via a worker thread + */ + if (!next_xl || (local->scars >= SCAR_LIMIT)) { + gf_log (this->name, GF_LOG_DEBUG, + HILITE("ran out of retries for %p"), frame->local); + goto unwind; + } + + local->curr_xl = next_xl; + local->scars += 1; + spec.tv_sec = 1; + spec.tv_nsec = 0; + /* + * WARNING + * + * Just calling gf_timer_call_after like this leaves open the + * possibility that writes will get reordered, if a first write is + * rescheduled and then a second comes along to find an updated + * priv->active before the first actually executes. We might need to + * implement a stricter (and more complicated) queuing mechanism to + * ensure absolute consistency in this case. + */ + if (gf_timer_call_after(this->ctx,spec,nsrc_retry_cb,local)) { + return 0; + } + +unwind: + call_stub_destroy(local->stub); + STACK_UNWIND_STRICT ($NAME$, frame, op_ret, op_errno, + $ARGS_SHORT$); + return 0; +} + +// template-name cont-func +$TYPE$ +nsrc_$NAME$_continue (call_frame_t *frame, xlator_t *this, + $ARGS_LONG$) +{ + nsrc_local_t *local = frame->local; + + STACK_WIND_COOKIE (frame, nsrc_$NAME$_cbk, local->curr_xl, + local->curr_xl, local->curr_xl->fops->$NAME$, + $ARGS_SHORT$); + return 0; +} diff --git a/xlators/cluster/nsr-client/src/gen-fops.py b/xlators/cluster/nsr-client/src/gen-fops.py new file mode 100755 index 000000000..b07b3c5b1 --- /dev/null +++ b/xlators/cluster/nsr-client/src/gen-fops.py @@ -0,0 +1,57 @@ +#!/usr/bin/python + +# This script generates the boilerplate versions of most fops in the client, +# mostly so that we can use STACK_WIND instead of STACK_WIND_TAIL (see +# fop-template.c for the details). The problem we're solving is that we sit +# under DHT, which makes assumptions about getting callbacks only from its +# direct children. If we didn't define our own versions of these fops, the +# default versions would use STACK_WIND_TAIL and the callbacks would come from +# DHT's grandchildren. The code-generation approach allows us to handle this +# with a minimum of code, and also keep up with any changes to the fop table. + +import sys +sys.path.append("../../nsr-server/src") # Blech. +import codegen + +type_re = "([a-z_0-9]+)" +name_re = "\(\*fop_([a-z0-9]+)_t\)" +full_re = type_re + " *" + name_re +fop_cg = codegen.CodeGenerator() +fop_cg.skip = 2 +fop_cg.parse_decls(sys.argv[1],full_re) +fop_cg.load_templates(sys.argv[2]) + +# Use the multi-template feature to generate multiple callbacks from the same +# parsed declarations. +type_re = "([a-z_0-9]+)" +name_re = "\(\*fop_([a-z0-9]+)_cbk_t\)" +full_re = type_re + " *" + name_re +cbk_cg = codegen.CodeGenerator() +cbk_cg.skip = 5 +cbk_cg.parse_decls(sys.argv[1],full_re) +cbk_cg.load_templates(sys.argv[2]) + +# This is a nasty little trick to handle the case where a generated fop needs +# a set of default arguments for the corresponding callback. +# +# Yes, it's ironic that I'm copying and pasting the generator code. +fop_cg.make_defaults = cbk_cg.make_defaults + +# Sorry, getspec, you're not a real fop until someone writes a stub function +# for you. +del fop_cg.decls["getspec"] +del cbk_cg.decls["getspec"] + +# cbk is used by both fop and continue, so emit first +for f_name in cbk_cg.decls.keys(): + cbk_cg.emit(f_name,"cbk") + print("") + +# continue is used by fop, so emit next +for f_name in fop_cg.decls.keys(): + fop_cg.emit(f_name,"cont-func") + print("") + +for f_name in fop_cg.decls.keys(): + fop_cg.emit(f_name,"fop") + print("") diff --git a/xlators/cluster/nsr-client/src/nsrc.c b/xlators/cluster/nsr-client/src/nsrc.c new file mode 100644 index 000000000..4551a1432 --- /dev/null +++ b/xlators/cluster/nsr-client/src/nsrc.c @@ -0,0 +1,243 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "call-stub.h" +#include "defaults.h" +#include "timer.h" +#include "xlator.h" + +#define SCAR_LIMIT 20 +#define HILITE(x) ("[1;33m"x"[0m") + +/* + * The fops are actually generated by gen-fops.py; the rest was mostly copied + * from defaults.c (commit cd253754 on 27 August 2013). + */ + +enum gf_dht_mem_types_ { + gf_mt_nsrc_private_t = gf_common_mt_end + 1, + gf_mt_nsrc_end +}; + +typedef struct { + xlator_t *active; +} nsrc_private_t; + +typedef struct { + call_stub_t *stub; + xlator_t *curr_xl; + uint16_t scars; +} nsrc_local_t; + +char *NSRC_XATTR = "user.nsr.active"; + +static inline +xlator_t * +ACTIVE_CHILD (xlator_t *parent) +{ + nsrc_private_t *priv = parent->private; + + return priv ? priv->active : FIRST_CHILD(parent); +} + +xlator_t * +next_xlator (xlator_t *this, xlator_t *prev) +{ + xlator_list_t *trav; + + for (trav = this->children; trav; trav = trav->next) { + if (trav->xlator == prev) { + return trav->next ? trav->next->xlator + : this->children->xlator; + } + } + + return NULL; +} + +void +nsrc_retry_cb (void *cb_arg) +{ + nsrc_local_t *local = cb_arg; + + gf_log (__func__, GF_LOG_INFO, HILITE("retrying %p"), local); + call_resume_wind(local->stub); +} + +#include "nsrc-cg.c" + +int32_t +nsrc_forget (xlator_t *this, inode_t *inode) +{ + gf_log_callingfn (this->name, GF_LOG_WARNING, "xlator does not " + "implement forget_cbk"); + return 0; +} + + +int32_t +nsrc_releasedir (xlator_t *this, fd_t *fd) +{ + gf_log_callingfn (this->name, GF_LOG_WARNING, "xlator does not " + "implement releasedir_cbk"); + return 0; +} + +int32_t +nsrc_release (xlator_t *this, fd_t *fd) +{ + gf_log_callingfn (this->name, GF_LOG_WARNING, "xlator does not " + "implement release_cbk"); + return 0; +} + +struct xlator_fops fops = { + .lookup = nsrc_lookup, + .stat = nsrc_stat, + .fstat = nsrc_fstat, + .truncate = nsrc_truncate, + .ftruncate = nsrc_ftruncate, + .access = nsrc_access, + .readlink = nsrc_readlink, + .mknod = nsrc_mknod, + .mkdir = nsrc_mkdir, + .unlink = nsrc_unlink, + .rmdir = nsrc_rmdir, + .symlink = nsrc_symlink, + .rename = nsrc_rename, + .link = nsrc_link, + .create = nsrc_create, + .open = nsrc_open, + .readv = nsrc_readv, + .writev = nsrc_writev, + .flush = nsrc_flush, + .fsync = nsrc_fsync, + .opendir = nsrc_opendir, + .readdir = nsrc_readdir, + .readdirp = nsrc_readdirp, + .fsyncdir = nsrc_fsyncdir, + .statfs = nsrc_statfs, + .setxattr = nsrc_setxattr, + .getxattr = nsrc_getxattr, + .fsetxattr = nsrc_fsetxattr, + .fgetxattr = nsrc_fgetxattr, + .removexattr = nsrc_removexattr, + .fremovexattr = nsrc_fremovexattr, + .lk = nsrc_lk, + .inodelk = nsrc_inodelk, + .finodelk = nsrc_finodelk, + .entrylk = nsrc_entrylk, + .fentrylk = nsrc_fentrylk, + .rchecksum = nsrc_rchecksum, + .xattrop = nsrc_xattrop, + .fxattrop = nsrc_fxattrop, + .setattr = nsrc_setattr, + .fsetattr = nsrc_fsetattr, + .fallocate = nsrc_fallocate, + .discard = nsrc_discard, +}; + +struct xlator_cbks cbks = { +}; + + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + GF_VALIDATE_OR_GOTO ("nsrc", this, out); + + ret = xlator_mem_acct_init (this, gf_mt_nsrc_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "Memory accounting init" "failed"); + return ret; + } +out: + return ret; +} + + +int32_t +nsrc_init (xlator_t *this) +{ + nsrc_private_t *priv = NULL; + + this->local_pool = mem_pool_new (nsrc_local_t, 128); + if (!this->local_pool) { + gf_log (this->name, GF_LOG_ERROR, + "failed to create nsrc_local_t pool"); + goto err; + } + + priv = GF_CALLOC (1, sizeof (*priv), gf_mt_nsrc_private_t); + if (!priv) { + goto err; + } + + priv->active = FIRST_CHILD(this); + this->private = priv; + return 0; + +err: + if (priv) { + GF_FREE(priv); + } + return -1; +} + +void +nsrc_fini (xlator_t *this) +{ + GF_FREE(this->private); +} + +int32_t +nsrc_notify (xlator_t *this, int32_t event, void *data, ...) +{ + int32_t ret = 0; + + switch (event) { + case GF_EVENT_CHILD_DOWN: + /* + * TBD: handle this properly + * + * What we really should do is propagate this only if it caused + * us to lose quorum, and likewise for GF_EVENT_CHILD_UP only + * if it caused us to gain quorum. However, that requires + * tracking child states and for now it's easier to swallow + * these unconditionally. The consequence of failing to do + * this is that DHT sees the first GF_EVENT_CHILD_DOWN and gets + * confused, so it doesn't call us and doesn't get up-to-date + * directory listings etc. + */ + break; + default: + ret = default_notify (this, event, data); + } + + return ret; +} + +class_methods_t class_methods = { + .init = nsrc_init, + .fini = nsrc_fini, + .notify = nsrc_notify, +}; + +struct volume_options options[] = { + { .key = {NULL} }, +}; diff --git a/xlators/cluster/nsr-recon/Makefile.am b/xlators/cluster/nsr-recon/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/cluster/nsr-recon/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/cluster/nsr-recon/src/Makefile.am b/xlators/cluster/nsr-recon/src/Makefile.am new file mode 100644 index 000000000..e639e4437 --- /dev/null +++ b/xlators/cluster/nsr-recon/src/Makefile.am @@ -0,0 +1,23 @@ +xlator_LTLIBRARIES = nsr_recon.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + +nsr_recon_la_LDFLAGS = -module -avoid-version +nsr_recon_la_SOURCES = recon_driver.c recon_xlator.c + +nsr_recon_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ + $(top_builddir)/api/src/libgfapi.la + +noinst_HEADERS = recon_driver.h recon_xlator.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) \ + -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \ + -I$(top_srcdir)/rpc/rpc-lib/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +XLATOR_HEADER = $(top_srcdir)/libglusterfs/src/xlator.h + +CLEANFILES = + +uninstall-local: + rm -f $(DESTDIR)$(xlatordir)/nsr.so diff --git a/xlators/cluster/nsr-recon/src/recon_driver.c b/xlators/cluster/nsr-recon/src/recon_driver.c new file mode 100644 index 000000000..8c7622a02 --- /dev/null +++ b/xlators/cluster/nsr-recon/src/recon_driver.c @@ -0,0 +1,3130 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <sys/types.h> +#include <fcntl.h> +#include <string.h> +#include <unistd.h> +#include <fnmatch.h> + + +#include "call-stub.h" +#include "defaults.h" +#include "xlator.h" + + +#include "recon_driver.h" +#include "recon_xlator.h" +#include "api/src/glfs-internal.h" +#include "api/src/glfs-handles.h" + +/* TBD: move declarations here and nsr.c into a common place */ +#define NSR_TERM_XATTR "trusted.nsr.term" +#define RECON_TERM_XATTR "trusted.nsr.recon-term" +#define RECON_INDEX_XATTR "trusted.nsr.recon-index" + +/* + * Execution architecture for the NSR reconciliation driver. The driver runs + * as a seperate process in each node where the brick is. The main function of + * the driver is nsr_reconciliation_driver() (last function below) The driver + * just sits in a tight loop waiting for state changes. When a brick becomes a + * replica leader, it fences IO, contacts this process and waits for + * reconciliation to finish. + * + * The replica leader talks to other bricks in replica group which are alive + * and gets the last term info using which it decides which has the latest + * data. That brick is referred to as the "reconciliator"; leader sends a + * message to reconciliator to freeze its data (by reading any incomplete data + * from other nodes from that term if required) + * + * Once that is done leader sends a message to all nodes except the + * reconciliator to sync themselves with the reconciliator. This process is + * referred to as "resolution". + * + * Hence the reconciliation processes need to talk to each other to get a given + * term info. This is implemented using the recon translator IOs which + * implements a bare bone RPC by exposing a file interface to which + * reads/writes are done to pass control messages. This is referred to as the + * "control plane". This implementation allows the control plane to be + * implemented as a bunch of threads for each of the nodes. + * + * The reconciliation process also needs to talk to the brick process on that + * node to actually write the data as part of reconciliation/resolution. This + * is referred to as the "data plane". Again there are a bunch of threads that + * do this work. + * + * The way the worker threads are organised is that main driver context has a + * pointer to contexts for each of these thread contexts. The thread context at + * index 0 always refers to talking with local recon process/brick. So the + * control worker at index 0 will get the local changelog info and data worker + * at index 0 will talk to local brick. + * + * All the ops from the control/data planes are implemented using the glfs + * APIs. + */ + +#if defined(NSR_DEBUG) + +/* This lets us change on the fly even if NSR_DEBUG is defined. */ +int nsr_debug_level = GF_LOG_TRACE; + +FILE * +recon_create_log (char *member, char *module) +{ + char *dpath = NULL; + char *p; + char *fpath = NULL; + FILE *fp = NULL; + int fd = -1; + + (void)mkdir(NSR_LOG_DIR,0777); + (void)asprintf(&dpath,NSR_LOG_DIR"/%s",member); + if (dpath) { + for (p = dpath + strlen(NSR_LOG_DIR) + 1; *p; ++p) { + if (*p == '/') { + *p = '-'; + } + } + (void)mkdir(dpath,0777); + (void)asprintf(&fpath,"%s/%s",dpath,module); + if (fpath) { + fd = open(fpath,O_WRONLY|O_CREAT|O_APPEND|O_SYNC,0666); + if (fd >= 0) { + fp = fdopen(fd,"a"); + if (!fp) { + close(fd); + } + } + if (fp) { + if (setvbuf (fp, NULL, _IONBF, 0)) { + /* + * Might as well take advantage of it + * to log the error. + */ + fprintf (fp, + "setvbuf failed for log\n"); + fprintf (fp, + "log output may be async\n"); + fflush(fp); + } + } + free(fpath); + } + free(dpath); + } + + return fp; +} + +void +_nsr_driver_log (const char *func, int line, char *member, FILE *fp, + char *fmt, ...) +{ + va_list ap; + char *buf = NULL; + int retval; + + if (!fp) { + fp = recon_create_log(member,"nsr-driver-log"); + if (!fp) { + return; + } + } + + va_start(ap,fmt); + retval = vasprintf(&buf,fmt,ap); + if (buf) { + fprintf(fp,"[%s:%d] %.*s\n",func,line,retval,buf); + free(buf); + } + va_end(ap); +} + +void +_nsr_worker_log (const char *func, int line, char *member, + char *type, uint32_t index, FILE *fp, + char *fmt, ...) +{ + va_list ap; + char *buf = NULL; + int retval; + + if (!fp) { + char *name; + if (asprintf(&name,"%s-%u",type,index) < 1) { + return; + } + fp = recon_create_log (member, name); + if (!fp) { + return; + } + } + + va_start(ap,fmt); + retval = vasprintf(&buf,fmt,ap); + if (buf) { + fprintf(fp,"[%s:%d] %.*s\n",func,line,retval,buf); + free(buf); + } + va_end(ap); +} + +#endif + +/* + * Recon Driver Calloc + * + * We need this because all of this messing about with gfapi from within a + * translator keeps scrambling THIS (only one reason it's a terrible idea) and + * we need THIS to have a value that represents our initialization with our + * memory types. + * + * Note that the macro requires "this" to be defined in the current scope. + */ + +#define RD_CALLOC(x,y,z) ({THIS = this; GF_CALLOC(x,y,z); }) + +/* + * This function gets the size of all the extended attributes for a file. + * This is used so that caller knows how much to allocate for key-value storage. + * + * Input Arguments: + * fd - the file opened using glfs API. + * dict - passed so that NSR translator can get this from the required brick + * + * Output Arguments: + * b - pointer to the buffer where the attributes are filled up. + * key_size - the size of all keys + * val_size - the size of all values + * num - number of key/values + */ +static int32_t +get_xattr_total_size( struct glfs_fd *fd, + char **b, + uint32_t *key_size, + uint32_t *val_size, + uint32_t* num, + dict_t *dict) +{ + int32_t s = -1, ret = -1; + char *c = NULL; + + *key_size = 0; + *val_size = 0; + *num = 0; + + // First get the size of the keys + s = glfs_flistxattr_with_xdata(fd, NULL,0, dict); + if (s == -1) { + goto out; + } + *key_size = s; + + // TBD - use the regular calloc + (*b) = c = calloc(s+1,1); + + // get the keys themselves + if (glfs_flistxattr_with_xdata(fd, c, s+1, dict) == -1) { + goto out; + } + do { + int32_t r; + uint32_t len = 0; + // for each key get the size of the value + r = glfs_fgetxattr_with_xdata(fd, c, NULL, 0, dict); + if (r == -1) + goto out; + (*val_size) += r; + len = strlen(c) + 1; + c += len; + s -= len; + (*num)++; + } while(s); + ret = 0; +out: + return ret; +} + +/* + * This function gets bunch of xattr values given set of keys. + * + * Input Arguments: + * fd - the file opened using glfs API. + * keys - the bunch of keys + * size - size of values + * num - number of keys + * dict - passed so that NSR translator can get this from the required brick + * + * Output Arguments: + * buf - where the values are written one after the other (NULL seperated) + */ +static int32_t +get_xattr(struct glfs_fd *fd, + char *keys, + char *buf, + uint32_t size, + uint32_t num, + dict_t *dict) +{ + while(num--) { + int32_t r; + uint32_t len = 0; + + // copy the key + strcpy(buf, keys); + len = strlen(keys); + len++; + buf += len; + + // get the value and copy the value after incrementing buf after the key + r = glfs_fgetxattr_with_xdata(fd, keys, buf, size, dict); + + // TBD - handle error + if (r == -1) + return -1; + + // increment the key to next value + keys += len; + + // increment buf to hold the next key + buf += strlen(buf) + 1; + } + return 0; +} + +/* + * Function deletes a bunch of key values in extended attributes of a file. + * Input Arguments: + * fd - the file opened using glfs API. + * dict - passed so that NSR translator can do this from the required brick + * keys - bunch of NULL seperated key names + * num - number of keys + */ +static int32_t delete_xattr(struct glfs_fd *fd, + dict_t *dict_t, + char *keys, + uint32_t num) +{ + while(num--) { + // get the value and copy the value + // TBD - handle failure cases when calling glfs_fremovexattr_with_xdata() + if (glfs_fremovexattr_with_xdata(fd, keys, dict_t) == -1) + return -1; + keys += strlen(keys) +1; + } + return 0; +} + +/* + * Given a bunch of key value pairs, fill them as xattrs for a file + * + * Input Arguments: + * fd - the file opened using glfs API. + * dict - passed so that NSR translator can do this from the required brick + * buf - buffer containing the keys-values pairs. The key value are NULL seperated. + * Each of the key-value is seperated by NULL in turn. + * num - Number of such key value pairs. + */ +static int32_t +fill_xattr(struct glfs_fd *fd, + dict_t *dict, + char *buf, + uint32_t num) +{ + char *k = buf, *val = NULL; + + while(num--) { + int32_t r; + + val = k + strlen(k) + 1; + + // TBD - handle failure cases when calling glfs_fsetxattr_with_xdata() + r = glfs_fsetxattr_with_xdata(fd, k, val, strlen(val), 0, dict); + if (r == -1) + return -1; + k = val + strlen(val) + 1; + } + return 0; +} + +/* + * This function gets a file that can be used for doing glfs_init later. + * The control file is used by control thread(function) to talk to peer reconciliation process. + * The data file is used by the data thread(function) to talk to the bricks. + * The control file is of name such as con:gfs1:-mnt-a1 where "gfs1" is name of host + * and the brick path is "/mnt/a1". + * The data file is of name such as data:gfs1:-mnt-a1. + * + * Input Arguments: + * vol - name of the volume. This is used to build the full path of the control and data file + * such as /var/lib/glusterd/vols/test/bricks/gfs2:-mnt-test1-nsr-recon.vol. + * In above example the volume name is test and brick on gfs2 is on path /mnt/test1 + * + * worker - The worker for a given node. This worker has 2 threads - one on the data plane + * and one on the control plane. The worker->name is already filled with hostname:brickname + * in the function nsr_reconciliation_driver(). Use that to build the volume file. + * So if worker->name has gfs1:/mnt/a1, control file is con:gfs1:-mnt-a1 + * and data file is data:gfs1:-mnt-a1. + * All these files are under the bricks directory. TBD - move this to a NSR recon directory later. + */ +static void +nsr_recon_get_file(char *vol, nsr_replica_worker_t *worker) +{ + char *ptr; + char tr[256]; + + // Replace the "/" to - + strcpy(tr, worker->name); + ptr = strchr (tr, '/'); + while (ptr) { + *ptr = '-'; + ptr = strchr (tr, '/'); + } + + // Build the base directory such as "/var/lib/glusterd/vols/test/bricks/" + sprintf(worker->control_worker->vol_file, + "/%s/%s/%s/%s/", + GLUSTERD_DEFAULT_WORKDIR, + GLUSTERD_VOLUME_DIR_PREFIX, + vol, + GLUSTERD_BRICK_INFO_DIR); + + strcat(worker->control_worker->vol_file, "con:"); + strcat(worker->control_worker->vol_file, tr); + + sprintf(worker->data_worker->vol_file, + "/%s/%s/%s/%s/", + GLUSTERD_DEFAULT_WORKDIR, + GLUSTERD_VOLUME_DIR_PREFIX, + vol, + GLUSTERD_BRICK_INFO_DIR); + strcat(worker->data_worker->vol_file, "data:"); + strcat(worker->data_worker->vol_file, tr); +} + +/* + * This function does all the glfs initialisation + * so that reconciliation process can talk to other recon processes/bricks + * for the control/data messages. + * This will be done everytime a worker needs to be kicked off to talk + * across any plane. + * + * Input arguments: + * ctx - The per worker based context + * control - set to true if this worker is for the control plane + */ +static int32_t +nsr_recon_start_work(nsr_per_node_worker_t *ctx, + gf_boolean_t control) +{ + glfs_t *fs = NULL; + xlator_t *this = ctx->driver_ctx->this; + int32_t ret = 0; + glfs_fd_t *aux_fd = NULL; // fd of auxilary log + char lf[256]; + nsr_recon_private_t *priv = NULL; + char *my_name = NULL; + char *morph_name = NULL, *ptr = NULL; + + priv = this->private; + my_name = RD_CALLOC (1, + strlen (priv->replica_group_members[0]) + 1, + gf_mt_recon_member_name_t); + strcpy (my_name, priv->replica_group_members[0]); + + nsr_worker_log(this->name, GF_LOG_INFO, + "starting work with volfile %s\n", + ctx->vol_file); + + fs = glfs_new(ctx->id); + if (!fs) { + glusterfs_this_set(this); + nsr_worker_log(this->name, GF_LOG_ERROR, + "cannot create gfls context for thread %s\n",ctx->id); + return -1; + } + + // For some vague reason, glfs init APIs seem to be clobbering "this". hence resetting it. + glusterfs_this_set(this); + nsr_worker_log(this->name, GF_LOG_INFO, + "init done. setting volfile %s\n", + ctx->vol_file); + + ret = glfs_set_volfile(fs, ctx->vol_file); + if (ret != 0) { + glusterfs_this_set(this); + nsr_worker_log(this->name, GF_LOG_ERROR, + "cannot set volfile %s for thread %s\n",ctx->vol_file, ctx->id); + return -1; + } + + morph_name = RD_CALLOC (1, strlen (my_name) + 1, + gf_mt_recon_member_name_t); + strcpy (morph_name, my_name); + + ptr = strchr (morph_name, '/'); + while (ptr) + { + *ptr = '-'; + ptr = strchr (morph_name, '/'); + } + // TBD - convert this to right /usr/local/var/log based log files. + + sprintf(lf, NSR_LOG_DIR"/%s/%s-%"PRIu32, morph_name, + (control == _gf_true)?"glfs-con":"glfs-data", ctx->index); + ret = glfs_set_logging (fs, lf, 7); + if (ret) { + glusterfs_this_set(this); + gf_log (this->name, GF_LOG_ERROR, "glfs logging set failed (%s)", + strerror (errno)); + return -1; + } + + ret = glfs_init (fs); + if (ret != 0) { + glusterfs_this_set(this); + nsr_worker_log(this->name, GF_LOG_ERROR, "cannot do init for thread %s with volfile %s\n",ctx->id, ctx->vol_file); + return -1; + } + glusterfs_this_set(this); + nsr_worker_log(this->name, GF_LOG_INFO, + "setting volfile %s done\n", + ctx->vol_file); + + // If it is control thread, open the "/" as the aux_fd. + // All IOs happening via the fd will do the RPCs across the reconciliation + // processes. For some vague reason, the root seems to be open'able like a file. + // TBD - try to clean this up. (implement a virtual file???) + if (control == _gf_true) { + nsr_worker_log(this->name, GF_LOG_INFO, + "doing open for / \n"); + aux_fd = glfs_open (fs, "/", O_RDWR); + // TBD - proper error handling. Stall reconciliation if such a thing happens? + if (aux_fd == NULL) { + nsr_worker_log(this->name, GF_LOG_ERROR, + "cannot open aux log file for thread %s\n",ctx->id); + return -1; + } else { + nsr_worker_log(this->name, GF_LOG_ERROR, + "---opened aux log file for thread %s\n",ctx->id); + } + ctx->aux_fd = aux_fd; + } + glusterfs_this_set(this); + ctx->fs = fs; + return 0; +} + +/* + * + * This function does the cleanup after reconciliation is done + * or before we start a new reconciliation. + * + * Input arguments: + * ctx - The per worker based context + * control - set to true if this worker is for the control plane + */ +static int32_t +nsr_recon_end_work(nsr_per_node_worker_t *ctx, + gf_boolean_t control) +{ + int32_t ret = 0; + xlator_t *this = ctx->driver_ctx->this; + + nsr_worker_log(this->name, GF_LOG_INFO, + "doing fini for recon worker\n"); + + ret = glfs_fini(ctx->fs); + if (ret != 0) { + glusterfs_this_set(this); + nsr_worker_log(this->name, GF_LOG_ERROR, "cannot do fini for thread %s with volfile %s\n",ctx->id, ctx->vol_file); + return -1; + } + glusterfs_this_set(this); + ctx->fs = NULL; + if (control == _gf_true) { + glfs_close (ctx->aux_fd); + ctx->aux_fd = NULL; + } + return 0; +} + +//called in case all worker functions run as sepeerate threads +static void +init_worker(nsr_per_node_worker_t *ctx, gf_boolean_t control) +{ + pthread_mutex_init(&(ctx->mutex), NULL); + pthread_cond_init(&(ctx->cv), NULL); + INIT_LIST_HEAD(&(ctx->head.list)); +} + + +/* + * Control worker funct for getting changelog info on this node. + * calls directly functions to parse the changelog. + * + * Input arguments: + * ctx - The per worker based context + * control - set to true if this worker is for the control plane + */ +static void +control_worker_func_0(nsr_per_node_worker_t *ctx, + nsr_recon_work_t *work) +{ + unsigned int index = ctx->index; + nsr_replica_worker_t *rw = &(ctx->driver_ctx->workers[index]); + nsr_recon_driver_ctx_t *dr = ctx->driver_ctx; + xlator_t *this = dr->this; + nsr_recon_private_t *priv = this->private; + + ctx->is_control = _gf_true; + + switch (work->req_id) { + case NSR_WORK_ID_INI: + { + break; + } + case NSR_WORK_ID_FINI: + { + break; + } + case NSR_WORK_ID_GET_LAST_TERM_INFO: + { + nsr_recon_last_term_info_t lt; + nsr_reconciliator_info_t *recon_info = rw->recon_info; + // term is stuffed inside work->index. overloading. + int32_t term = work->index; + + nsr_worker_log(this->name, GF_LOG_INFO, + "trying to get last term info for node %d with current term %d\n",index, term); + + // TBD - handle errors + // This is called by the leader after it gets the current term. + // Makes searching easier. + nsr_recon_libchangelog_get_last_term_info(this, priv->changelog_base_path, term, <); + recon_info->last_term = lt.last_term; + recon_info->commited_ops = lt.commited_ops; + recon_info->last_index = lt.last_index; + recon_info->first_index = lt.first_index; + + + nsr_worker_log(this->name, GF_LOG_INFO, + "out of get last term info with current term %d. got ops %d with first %d and last %d \n", + recon_info->last_term, recon_info->commited_ops, + recon_info->first_index, recon_info->last_index); + break; + } + case NSR_WORK_ID_GET_GIVEN_TERM_INFO: + { + nsr_recon_last_term_info_t lt; + nsr_reconciliator_info_t *recon_info = rw->recon_info; + // term is stuffed inside work->index. overloading. + int32_t term = work->index; + + nsr_worker_log(this->name, GF_LOG_INFO, + "trying to get term info for node %d for term %d\n",index, term); + + // TBD - handle errors + nsr_recon_libchangelog_get_this_term_info(this,priv->changelog_base_path, term, <); + + recon_info->last_term = lt.last_term; + recon_info->commited_ops = lt.commited_ops; + recon_info->last_index = lt.last_index; + recon_info->first_index = lt.first_index; + + nsr_worker_log(this->name, GF_LOG_INFO, + "out of get term info for term %d. got ops %d with first %d and last %d \n", + recon_info->last_term, recon_info->commited_ops, + recon_info->first_index, recon_info->last_index); + + break; + } + case NSR_WORK_ID_RECONCILIATOR_DO_WORK: + { + // For local resolution, the main driver thread does it. + // SO there is no way we can have this message for this node. + + nsr_worker_log(this->name, GF_LOG_INFO, + "this message should not be sent \n"); + break; + } + case NSR_WORK_ID_RESOLUTION_DO_WORK: + { + + nsr_worker_log(this->name, GF_LOG_INFO, + "this message should not be sent \n"); + ctx->result = -1; + break; + } + case NSR_WORK_ID_GET_RECONCILATION_WINDOW: + { + nsr_reconciliator_info_t *recon_info = rw->recon_info; + // first_index and last_index at 0 indicates empty log. + // For non empty log, the first_index always starts at 1. + uint32_t num = (dr->workers[index].recon_info->last_index - + dr->workers[index].recon_info->first_index + 1); + nsr_recon_record_details_t *rd; + uint32_t i=0; + + nsr_worker_log(this->name, GF_LOG_INFO, + "trying to get reconciliation window records for node %d for term %d with first %d last %d\n", + index, recon_info->last_term, recon_info->first_index, recon_info->last_index); + + + // TBD - handle buffer allocation errors + rd = RD_CALLOC(num, + sizeof(nsr_recon_record_details_t), + gf_mt_recon_record_details_t); + if (rd == NULL) { + ctx->result = -1; + return; + } + + recon_info->records = RD_CALLOC(num, + sizeof(nsr_reconciliation_record_t), + gf_mt_recon_record_t); + if (recon_info->records == NULL) { + ctx->result = -1; + return; + } + + // TBD - handle errors + if (nsr_recon_libchangelog_get_records(this, priv->changelog_base_path, + recon_info->last_term, + recon_info->first_index, + recon_info->last_index, + rd) == _gf_false) { + ctx->result = -1; + return; + } + + // The above function writes into rd from 0 to (num -1) + // We need to take care of this whenever we deal with records + for (i=0; i < num; i++) { + ENDIAN_CONVERSION_RD(rd[i], _gf_true); //ntohl + memcpy(&(recon_info->records[i].rec), + &(rd[i]), + sizeof(nsr_recon_record_details_t)); + } + + GF_FREE(rd); + + nsr_worker_log(this->name, GF_LOG_INFO, + "got reconciliation window records for node %d for term %d \n", + index, recon_info->last_term); + break; + } + + default: + nsr_worker_log (this->name, GF_LOG_ERROR, + "bad req id %u", work->req_id); + } + + return; +} + +// Control worker thread +static void* +control_worker_main_0(nsr_per_node_worker_t *ctx) +{ + + ctx->is_control = _gf_true; + nsr_worker_log(this->name, GF_LOG_INFO, + "starting control worker func 0\n"); + + init_worker(ctx, 1); + + while(1) + { + nsr_recon_work_t *work = NULL; + nsr_recon_driver_ctx_t *dr = ctx->driver_ctx; + + nsr_worker_log(this->name, GF_LOG_INFO, + "waiting for work\n"); + + pthread_mutex_lock(&ctx->mutex); + while (list_empty(&(ctx->head.list))) { + pthread_cond_wait(&ctx->cv, &ctx->mutex); + } + pthread_mutex_unlock(&ctx->mutex); + + + list_for_each_entry(work, &(ctx->head.list), list) { + nsr_worker_log(this->name, GF_LOG_INFO, + "got work with id %d\n", work->req_id); + work->in_use = _gf_false; + + // Call the main function. + control_worker_func_0(ctx, work); + + atomic_dec(&(dr->outstanding)); + break; + } + + nsr_worker_log(this->name, GF_LOG_INFO,"deleting work item\n"); + list_del_init (&work->list); + GF_FREE(work); + nsr_worker_log(this->name, GF_LOG_INFO,"finished deleting work item\n"); + } + + return NULL; +} + +static void +control_worker_do_reconciliation (nsr_per_node_worker_t *ctx, + nsr_recon_work_t *work) +{ + unsigned int index = ctx->index; + nsr_recon_driver_ctx_t *dr = ctx->driver_ctx; + nsr_recon_role_t rr; + uint32_t i=0; + uint32_t num=0; + uint32_t idx = dr->reconciliator_index; + uint32_t term = dr->workers[idx].recon_info->last_term; + + GF_ASSERT(idx == index); + + nsr_worker_log(this->name, GF_LOG_INFO, + "trying to make this index %d as reconciliator for term %d\n", index, term); + + // TBD - error handling for all the glfs APIs + if (glfs_lseek(ctx->aux_fd, + nsr_recon_xlator_sector_1, + SEEK_SET) == -1) { + ctx->result = -1; + return; + } + + // We have all the info for all other nodes. + // Fill all that info when sending data to that process. + for (i=0; i < dr->replica_group_size; i++) { + if ( dr->workers[i].in_use && + (dr->workers[i].recon_info->last_term == term)) { + rr.info[num].last_term = + dr->workers[i].recon_info->last_term; + rr.info[num].commited_ops = + dr->workers[i].recon_info->commited_ops; + rr.info[num].last_index = + dr->workers[i].recon_info->last_index; + rr.info[num].first_index = + dr->workers[i].recon_info->first_index; + strcpy(rr.info[num].name, + dr->workers[i].name); + } + num++; + } + rr.num = num; + rr.role = reconciliator; + ENDIAN_CONVERSION_RR(rr, _gf_false); //htonl + if (glfs_write(ctx->aux_fd, &rr, sizeof(rr), 0) == -1) { + ctx->result = -1; + // Put the errno only for this case since we are bothered about + // retrying only for this case. For rest of the cases we will + // just return EIO in errno. + ctx->op_errno = errno; + return; + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "sent reconciliator info for term %d with node count as %d\n", term, num); +} + +static void +control_worker_do_resolution (nsr_per_node_worker_t *ctx, + nsr_recon_work_t *work) +{ + unsigned int index = ctx->index; + nsr_recon_driver_ctx_t *dr = ctx->driver_ctx; + nsr_recon_role_t rr; + unsigned int i=0, j=0; + unsigned int rec = dr->reconciliator_index; + + nsr_worker_log(this->name, GF_LOG_INFO, + "trying to make this index %d as resolutor with reconciliator as %d\n",index, rec); + + // TBD - error handling for all the glfs APIs + if (glfs_lseek(ctx->aux_fd, + nsr_recon_xlator_sector_1, + SEEK_SET) == -1) { + ctx->result = -1; + return; + } + + rr.num = 2; + + // Fill in info[0] as info for the node for which we are seeking + // resolution. Fill in info[1] as info of the reconciliator node. The + // function nsr_recon_driver_get_role() that will be called when this + // message reaches the node will look at index 1 for term information + // related to the reconciliator. + for (i=0; i < 2; i++) { + (i == 0) ? (j = index) : (j = rec); + rr.info[i].last_term = + dr->workers[j].recon_info->last_term; + rr.info[i].commited_ops = + dr->workers[j].recon_info->commited_ops; + rr.info[i].last_index = + dr->workers[j].recon_info->last_index; + rr.info[i].first_index = + dr->workers[j].recon_info->first_index; + // The name is used as the key to convert indices since the + // reconciliator index could be different across the nodes. + strcpy(rr.info[i].name, + dr->workers[j].name); + if (i == 0) { + nsr_worker_log(this->name, GF_LOG_INFO, + "this node info term=%d, ops=%d, first=%d, last=%d\n", + rr.info[i].last_term, rr.info[i].commited_ops, + rr.info[i].first_index,rr.info[i].last_index); + } else { + nsr_worker_log(this->name, GF_LOG_INFO, + "reconciliator node info term=%d, ops=%d, first=%d, last=%d\n", + rr.info[i].last_term, rr.info[i].commited_ops, + rr.info[i].first_index,rr.info[i].last_index); + } + } + rr.role = resolutor; + ENDIAN_CONVERSION_RR(rr, _gf_false); //htonl + if (glfs_write(ctx->aux_fd, &rr, sizeof(rr), 0) == -1) { + ctx->result = -1; + // Put the errno only for this case since we are bothered about + // retrying only for this case. For rest of the cases we will + // just return EIO in errno. + ctx->op_errno = errno; + return; + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "sent message to this node %d resolutor with reconciliator as %d\n", index, rec); +} + +static void +control_worker_get_window (nsr_per_node_worker_t *ctx, nsr_recon_work_t *work) +{ + unsigned int index = ctx->index; + nsr_replica_worker_t *rw = &(ctx->driver_ctx->workers[index]); + nsr_recon_driver_ctx_t *dr = ctx->driver_ctx; + xlator_t *this = dr->this; + nsr_recon_log_info_t li; + nsr_reconciliator_info_t *recon_info = rw->recon_info; + uint32_t i = 0; + uint32_t num = (dr->workers[index].recon_info->last_index - + dr->workers[index].recon_info->first_index +1); + nsr_recon_record_details_t *rd; + + nsr_worker_log(this->name, GF_LOG_INFO, + "trying to get reconciliation window records for node %d for term %d with first %d last %d\n", + index, recon_info->last_term, recon_info->first_index, recon_info->last_index); + + // TBD - error handling for all the glfs APIs + if (glfs_lseek(ctx->aux_fd, nsr_recon_xlator_sector_2, SEEK_SET) == -1) { + ctx->result = -1; + return; + } + + // write to node what term & indices we are interested + li.term = recon_info->last_term; + li.first_index = recon_info->first_index; + li.last_index = recon_info->last_index; + ENDIAN_CONVERSION_LI(li, _gf_false); //htonl + if (glfs_write(ctx->aux_fd, &li, sizeof(li), 0) == -1) { + ctx->result = -1; + return; + } + + // then read + rd = RD_CALLOC(num, + sizeof(nsr_recon_record_details_t), + gf_mt_recon_private_t); + if (rd == NULL) { + ctx->result = -1; + return; + } + recon_info->records = RD_CALLOC(num, + sizeof(nsr_reconciliation_record_t), + gf_mt_recon_private_t); + if (recon_info->records == NULL) { + ctx->result = -1; + goto err; + } + + if (glfs_read(ctx->aux_fd, rd, num * sizeof(nsr_recon_record_details_t), 0) == -1) { + ctx->result = -1; + goto err; + } + + for (i=0; i < num; i++) { + ENDIAN_CONVERSION_RD(rd[i], _gf_true); //ntohl + memcpy (&(recon_info->records[i].rec), &(rd[i]), + sizeof(nsr_recon_record_details_t)); + nsr_worker_log(this->name, GF_LOG_INFO, + "get_reconcilaition_window:Got %d at index %d\n", + recon_info->records[i].rec.type, + i + recon_info->first_index); + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "got reconciliation window records for node %d for term %d \n", + index, recon_info->last_term); + +err: + GF_FREE(rd); +} + +/* + * Control worker funct for getting changelog info on some other node. + * calls glfs functions to seek/read/write on aux_fd. + * + * Input arguments: + * ctx - The per worker based context + * control - set to true if this worker is for the control plane + */ +static void +control_worker_func(nsr_per_node_worker_t *ctx, + nsr_recon_work_t *work) +{ + unsigned int index = ctx->index; + nsr_replica_worker_t *rw = &(ctx->driver_ctx->workers[index]); + nsr_recon_last_term_info_t lt; + nsr_reconciliator_info_t *recon_info = rw->recon_info; + int32_t term = htonl(work->index); // overloading it + + ctx->is_control = _gf_true; + + switch (work->req_id){ + + case NSR_WORK_ID_INI: + nsr_worker_log(this->name, GF_LOG_INFO, + "calling nsr_recon_start_work\n"); + + // TBD - handle error in case nsr_recon_start_work gives error + if (nsr_recon_start_work(ctx, _gf_true) != 0) { + ctx->result = -1; + return; + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "finished nsr_recon_start_work\n"); + break; + + case NSR_WORK_ID_FINI: + nsr_worker_log(this->name, GF_LOG_INFO, + "calling nsr_recon_end_work\n"); + + // TBD - handle error in case nsr_recon_end_work gives error + if (nsr_recon_end_work(ctx, _gf_true) != 0) { + ctx->result = -1; + return; + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "finished nsr_recon_end_work\n"); + break; + + case NSR_WORK_ID_GET_LAST_TERM_INFO: + nsr_worker_log(this->name, GF_LOG_INFO, + "trying to get last term info for node %d with current term %d\n",index, work->index); + + // first write the current term term number + // TBD - error handling for all the glfs APIs + if (glfs_lseek(ctx->aux_fd, nsr_recon_xlator_sector_4, SEEK_SET) == -1) { + ctx->result = -1; + return; + } + if (glfs_write(ctx->aux_fd, &term, sizeof(term), 0) == -1) { + ctx->result = -1; + return; + } + if (glfs_read(ctx->aux_fd, <, sizeof(lt), 0) == -1) { + ctx->result = -1; + return; + } + ENDIAN_CONVERSION_LT(lt, _gf_true); //ntohl + recon_info->last_term = lt.last_term; + recon_info->commited_ops = lt.commited_ops; + recon_info->last_index = lt.last_index; + recon_info->first_index = lt.first_index; + + nsr_worker_log(this->name, GF_LOG_INFO, + "out of get last term info with current term %d. got ops %d with first %d and last %d \n", + recon_info->last_term, recon_info->commited_ops, + recon_info->first_index, recon_info->last_index); + + break; + + case NSR_WORK_ID_GET_GIVEN_TERM_INFO: + nsr_worker_log(this->name, GF_LOG_INFO, + "trying to get term info for node %d for term %d\n",index, work->index); + + // first write the term number + // TBD - error handling for all the glfs APIs + if (glfs_lseek(ctx->aux_fd, nsr_recon_xlator_sector_3, SEEK_SET) == -1) { + ctx->result = -1; + return; + } + if (glfs_write(ctx->aux_fd, &term, sizeof(term), 0) == -1) { + ctx->result = -1; + return; + } + if (glfs_read(ctx->aux_fd, <, sizeof(lt), 0) == -1) { + ctx->result = -1; + return; + } + ENDIAN_CONVERSION_LT(lt, _gf_true); //ntohl + recon_info->last_term = lt.last_term; + recon_info->commited_ops = lt.commited_ops; + recon_info->last_index = lt.last_index; + recon_info->first_index = lt.first_index; + + nsr_worker_log(this->name, GF_LOG_INFO, + "out of get term info for term %d. got ops %d with first %d and last %d \n", + recon_info->last_term, recon_info->commited_ops, + recon_info->first_index, recon_info->last_index); + + break; + + case NSR_WORK_ID_RECONCILIATOR_DO_WORK: + control_worker_do_reconciliation(ctx,work); + break; + + case NSR_WORK_ID_RESOLUTION_DO_WORK: + control_worker_do_resolution(ctx,work); + break; + + case NSR_WORK_ID_GET_RECONCILATION_WINDOW: + control_worker_get_window(ctx,work); + break; + + default: + nsr_worker_log (this->name, GF_LOG_ERROR, + "bad work type %d", work->req_id); + } + + return; +} + +// Control worker thread +static void* +control_worker_main(nsr_per_node_worker_t *ctx) +{ + unsigned int index = ctx->index; + + ctx->is_control = _gf_true; + nsr_worker_log(this->name, GF_LOG_INFO, + "starting control worker func\n"); + + // if this is for local processing, call the changelog parsing calls directly + if (index == 0) { + control_worker_main_0(ctx); + return NULL; + } + + init_worker(ctx, 1); + + + while(1) + { + nsr_recon_work_t *work = NULL; + nsr_recon_driver_ctx_t *dr = ctx->driver_ctx; + + nsr_worker_log(this->name, GF_LOG_INFO, + "waiting for work\n"); + + pthread_mutex_lock(&ctx->mutex); + while (list_empty(&(ctx->head.list))) { + pthread_cond_wait(&ctx->cv, &ctx->mutex); + } + pthread_mutex_unlock(&ctx->mutex); + + + list_for_each_entry(work, &(ctx->head.list), list) { + nsr_worker_log(this->name, GF_LOG_INFO, + "got work with id %d\n", work->req_id); + work->in_use = _gf_false; + control_worker_func(ctx,work); + atomic_dec(&(dr->outstanding)); + break; + } + nsr_worker_log(this->name, GF_LOG_INFO,"deleting work item\n"); + list_del_init (&work->list); + GF_FREE(work); + nsr_worker_log(this->name, GF_LOG_INFO,"finished deleting work item\n"); + } + + return NULL; +} + +/* + * This function gets called if this process is chosen as the reconciliator + * for this replica group. It would have already got the records for the last term + * for the indices that are required (from the first HOLE to last index) from + * all other nodes that also witnessed that term. COmpare all the records and + * compute the work required. + * + * Input arguments + * ctx - driver context. All recon work is stored in workers[0].recon_info + */ +static void +compute_reconciliation_work(nsr_recon_driver_ctx_t *ctx) +{ + uint32_t i=0, j=0; + nsr_reconciliator_info_t *my_recon = ctx->workers[0].recon_info; + uint32_t num = (my_recon->last_index - my_recon->first_index + 1); + + for (i=0; i < num; i++) { + nsr_log_type_t orig, new; + unsigned int src = 0; + orig = new = my_recon->records[i].rec.type; + nsr_recon_work_type_t tw = NSR_RECON_WORK_NONE; + // index 0 means this node. Look at all other nodes. + for (j=1; j < ctx->replica_group_size; j++) { + if (ctx->workers[j].in_use) { + nsr_log_type_t pr = ctx->workers[j].recon_info->records[i].work.type; + if ((new != pr) && (pr > new)) { + src = j; + new = (new | pr); + } + } + } + // TBD - compare data if new and orig are all FILLs. (can detect changelog corruption) + // Right now we compare if both orig and new are psuedo holes since + // only that is of interest to us. + if (orig != new) { + if ((orig == NSR_LOG_HOLE) && (new == NSR_LOG_PSEUDO_HOLE)) + tw = NSR_RECON_WORK_HOLE_TO_PSEUDO_HOLE; + else if ((orig == NSR_LOG_HOLE) && (new == NSR_LOG_FILL)) + tw = NSR_RECON_WORK_HOLE_TO_FILL; + else if ((orig == NSR_LOG_PSEUDO_HOLE) && (new == NSR_LOG_PSEUDO_HOLE)) + tw = NSR_RECON_WORK_COMPARE_PSEUDO_HOLE; + else if ((orig == NSR_LOG_PSEUDO_HOLE) && (new == NSR_LOG_FILL)) + tw = NSR_RECON_WORK_HOLE_TO_FILL; + } + if (tw != NSR_RECON_WORK_NONE) { + my_recon->records[i].work.type = tw; + my_recon->records[i].work.source = src; + // Overwrite the record + memcpy(&(my_recon->records[i].rec), + &(ctx->workers[src].recon_info->records[i].rec), + sizeof(nsr_recon_record_details_t)); + } + } + return; +} + +static int32_t +nsr_recon_in_use(nsr_recon_driver_ctx_t *ctx, + uint32_t i, + gf_boolean_t in_use); + +/* + * Write the role and associated information to the node. + * This gets called from recon xlator indicating node is either + * leader, reconciliator or should do resolution. + */ +gf_boolean_t +nsr_recon_driver_set_role(nsr_recon_driver_ctx_t *ctx, + nsr_recon_role_t *rr, + uint32_t term) +{ + nsr_role_work_t *rw; + xlator_t *this = ctx->this; + + nsr_driver_log(this->name, GF_LOG_INFO, "set role called \n"); + rw = RD_CALLOC(1, sizeof (nsr_role_work_t), gf_mt_recon_role_work_t); + memcpy(&rw->role, rr, sizeof(nsr_recon_role_t)); + rw->term = term; + INIT_LIST_HEAD(&(rw->list)); + pthread_mutex_lock(&(ctx->mutex)); + list_add_tail(&rw->list, &ctx->role_head.list); + pthread_cond_signal(&(ctx->cv)); + pthread_mutex_unlock(&(ctx->mutex)); + nsr_driver_log(this->name, GF_LOG_INFO, "set role returns \n"); + return _gf_true; +} + +/* + * First we undo the last role to make sure we clean up. + * + * Input arguments + * ctx - driver context. + * rr - Role information. + * If leader, the thread now sends the list of all nodes that are part of + * the current replica group. Use that to find out the activate the + * required worker threads. + * If reconciliator, the leader node would have sent information about + * all nodes which saw last term as the reconciliator. + * If resolution to be done, then rr.info[0] will have this node's info + * which the leader would have got earlier. rr[1].info will have the + * info regarding the reconciliator. + * term - leader's term that is causing this role + */ +nsr_recon_driver_state_t +nsr_recon_driver_get_role(int32_t *status, + nsr_recon_driver_ctx_t *ctx, + nsr_role_work_t *rw) +{ + uint8_t i=0, j=0; + nsr_recon_role_t *rr = &(rw->role); + nsr_reconciliator_info_t *tmp; + xlator_t *this = ctx->this; + + // First make all the threads uninitialise + for (i = 0; i < ctx->replica_group_size; i++) { + if (nsr_recon_in_use(ctx, i, _gf_false) == -1) { + *status = -1; + return 0; + } + } + + switch (rr->role) { + case leader: + case joiner: + + // First set info this node + tmp = RD_CALLOC (1, sizeof (nsr_reconciliator_info_t), + gf_mt_recon_reconciliator_info_t); + if (!tmp) { + *status = -1; + return 0; + } + ctx->workers[0].recon_info = tmp; + if (nsr_recon_in_use(ctx, 0, _gf_true) == -1) { + *status = -1; + return 0; + } + ctx->current_term = rr->current_term; + + // Find rest of the nodes + for (i=1; i < ctx->replica_group_size; i++) { + for (j=0 ; /* nothing */; j++) { + if (j >= rr->num) { + nsr_driver_log (this->name, GF_LOG_ERROR, + "failed to find %s", + ctx->workers[i].name); + break; + } + if (strcmp(ctx->workers[i].name, + rr->info[j].name)) { + continue; + } + nsr_driver_log (this->name, GF_LOG_INFO, + "nsr_recon_driver_get_role: this as %s. found other server %s\n", + (rr->role == leader) ? "leader" + : "joiner", + ctx->workers[i].name); + + // Allocate this here. This will get later + // filled when the leader tries to get last term + // information from all the nodes + tmp = RD_CALLOC (1, + sizeof (nsr_reconciliator_info_t), + gf_mt_recon_reconciliator_info_t); + if (!tmp) { + *status = -1; + return 0; + } + ctx->workers[i].recon_info = tmp; + if (nsr_recon_in_use(ctx, i, _gf_true) == -1) { + *status = -1; + return 0; + } + break; + } + } + // If leader, reconciliator has to be chosen. + // If joiner, we are the reconciliator. + if (rr->role == leader) + ctx->reconciliator_index = -1; + else + ctx->reconciliator_index = 0; + break; + + case reconciliator: + ctx->reconciliator_index = 0; + // Copy information about all the other members which had the + // same term + for (i=0; i < rr->num; i++) { + for (j=0; /* nothing */; j++) { + if (j >= ctx->replica_group_size) { + nsr_driver_log (this->name, GF_LOG_ERROR, + "failed to find %s", + rr->info[i].name); + break; + } + if (strcmp(rr->info[i].name, + ctx->workers[j].name)) { + continue; + } + nsr_driver_log(this->name, GF_LOG_INFO, + "nsr_recon_driver_get_role: this as reconciliator. found other server %s\n", + ctx->workers[j].name); + tmp = RD_CALLOC (1, + sizeof (nsr_reconciliator_info_t), + gf_mt_recon_reconciliator_info_t); + if (!tmp) { + *status = -1; + return 0; + } + tmp->last_term = rr->info[i].last_term; + tmp->commited_ops = rr->info[i].commited_ops; + tmp->last_index = rr->info[i].last_index; + tmp->first_index = rr->info[i].first_index; + ctx->workers[j].recon_info = tmp; + if (nsr_recon_in_use(ctx, j, _gf_true) == -1) { + *status = -1; + return 0; + } + break; + } + } + break; + + case resolutor: + for (j=0; /* nothing */; j++) { + // info[1] has the information regarding the + // reconciliator + if (j >= ctx->replica_group_size) { + nsr_driver_log (this->name, GF_LOG_ERROR, + "failed to find %s", + rr->info[1].name); + break; + } + if (strcmp(rr->info[1].name, + ctx->workers[j].name)) { + continue; + } + nsr_driver_log(this->name, GF_LOG_INFO, + "nsr_recon_driver_get_role: this as resolutor. found other server %s as reconciliator\n", + ctx->workers[j].name); + tmp = RD_CALLOC (1, + sizeof (nsr_reconciliator_info_t), + gf_mt_recon_reconciliator_info_t); + if (!tmp) { + *status = -1; + return 0; + } + tmp->last_term = rr->info[1].last_term; + tmp->commited_ops = rr->info[1].commited_ops; + tmp->last_index = rr->info[1].last_index; + tmp->first_index = rr->info[1].first_index; + ctx->reconciliator_index = j; + ctx->workers[j].recon_info = tmp; + if (nsr_recon_in_use(ctx, j, _gf_true) == -1) { + *status = -1; + return 0; + } + GF_ASSERT(ctx->reconciliator_index != 0); + break; + } + tmp = RD_CALLOC (1, + sizeof (nsr_reconciliator_info_t), + gf_mt_recon_reconciliator_info_t); + if (!tmp) { + *status = -1; + return 0; + } + // info[0] has all info for this node + tmp->last_term = rr->info[0].last_term; + tmp->commited_ops = rr->info[0].commited_ops; + tmp->last_index = rr->info[0].last_index; + tmp->first_index = rr->info[0].first_index; + ctx->workers[0].recon_info = tmp; + if (nsr_recon_in_use(ctx, 0, _gf_true) == -1) { + *status = -1; + return 0; + } + } + + ctx->term = rw->term; + + *status = 0; + return rr->role; +} + + +/* + * This function gets called if this process is chosen to sync itself with + * the reconciliator. + * + * Input arguments + * ctx - driver context. + * my_info - local changelog info that has all the local records for indices that require work + * his_info - reconciliator's info that has all the golden copies + * invalidate - if set to true, then do not consult local records + */ + +static void +compute_resolution_work(nsr_recon_driver_ctx_t *ctx, + nsr_reconciliator_info_t *my_info, + nsr_reconciliator_info_t *his_info, + gf_boolean_t invalidate) +{ + uint32_t i=0; + uint32_t num = (my_info->last_index - my_info->first_index + 1); + xlator_t *this = ctx->this; + + if (invalidate) { + if (my_info->records) { + GF_FREE(my_info->records); + } + my_info->records = RD_CALLOC(num, + sizeof(nsr_reconciliation_record_t), + gf_mt_recon_record_t); + } + + for (i=0; i < num; i++) { + nsr_log_type_t orig, new; + nsr_recon_work_type_t tw = NSR_RECON_WORK_NONE; + orig = my_info->records[i].rec.type; + if (invalidate) + orig = NSR_LOG_HOLE; + new = his_info->records[i].rec.type; + // TBD - we can never have PSUEDO_HOLE in reconciliator's info + // We should have taken care of that during reconciliation. + // Put an assert to validate that. + if (new != orig) { + if ((orig != NSR_LOG_FILL) && (new == NSR_LOG_FILL)) + tw = NSR_RECON_WORK_HOLE_TO_FILL; + else if ((orig != NSR_LOG_HOLE) && (new == NSR_LOG_HOLE)) + tw = NSR_RECON_WORK_UNDO_FILL; + } + // copy the records anyway + my_info->records[i].work.type = tw; + my_info->records[i].work.source = ctx->reconciliator_index; + memcpy(&(my_info->records[i].rec), + &(his_info->records[i].rec), + sizeof(nsr_recon_record_details_t)); + } + return; +} + + +// Create an glfs object +static struct glfs_object * +create_obj(nsr_per_node_worker_t *ctx, char *gfid_str) +{ + struct glfs_object *obj = NULL; + uuid_t gfid; + + uuid_parse(gfid_str, gfid); + + obj = glfs_h_create_from_handle(ctx->fs, gfid, GFAPI_HANDLE_LENGTH, NULL); + if (obj == NULL) { + GF_ASSERT(obj != NULL); + nsr_worker_log(this->name, GF_LOG_ERROR, + "creating of handle failed\n"); + return NULL; + } + return obj; +} + +/* + * Function to apply the actual record onto the local brick. + * prior to this we should have read all the data from the + * brick that has the data. + * + * Input parameters: + * ctx - per node worker context that has the fs for communicating to brick + * ri - Reconciliation record that needs fixup + * dict - So that NSR server translator on brick applis fixup only on this brick + * and the changelog translator consumes term and index. + */ + +static gf_boolean_t +apply_record(nsr_per_node_worker_t *ctx, + nsr_reconciliation_record_t *ri, + dict_t * dict) +{ + struct glfs_fd *fd = NULL; + struct glfs_object *obj = NULL; + struct glfs_object *to_obj = NULL; + gf_boolean_t retval = _gf_false; + + if (ri->rec.op == GF_FOP_WRITE) { + + nsr_worker_log(this->name, GF_LOG_INFO, + "DOing write for file %s @offset %d for len %d\n", + ri->rec.gfid, ri->rec.offset, ri->rec.len); + + // The file has got deleted on the source. Hence just ignore + // this. + // TBD - get a way to just stuff the log entry without writing + // the data so that changelogs remain identical. + if (ri->work.data == NULL) { + return _gf_true; + } + + if ((obj = create_obj(ctx,ri->rec.gfid)) == NULL) + goto err; + + fd = glfs_h_open_with_xdata(ctx->fs, obj, O_RDWR, dict); + if (fd == NULL) { + nsr_worker_log(this->name, GF_LOG_ERROR, + "open for file %s failed\n", + ri->rec.gfid); + goto err; + } + if (glfs_lseek_with_xdata(fd, ri->rec.offset, SEEK_SET, dict) != ri->rec.offset) { + nsr_worker_log(this->name, GF_LOG_ERROR, + "lseek for file %s failed at offset %d\n", + ri->rec.gfid, ri->rec.offset); + goto err; + } + if (glfs_write_with_xdata(fd, ri->work.data, ri->rec.len, 0, dict) != ri->rec.len) { + nsr_worker_log(this->name, GF_LOG_ERROR, + "write for file %s failed for bytes %d\n", + ri->rec.gfid, ri->rec.len); + goto err; + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "Finished DOing write for gfid %s @offset %d for len %d\n", + ri->rec.gfid, ri->rec.offset, ri->rec.len); + + } else if (ri->rec.op == GF_FOP_FTRUNCATE) { + + nsr_worker_log(this->name, GF_LOG_INFO, + "DOing truncate for file %s @offset %d \n", + ri->rec.gfid, ri->rec.offset); + + if ((obj = create_obj(ctx, ri->rec.gfid)) == NULL) { + goto err; + } + + fd = glfs_h_open_with_xdata(ctx->fs, obj, O_RDWR, dict); + if (fd == NULL) { + nsr_worker_log(this->name, GF_LOG_ERROR, + "open for file %s failed\n", + ri->rec.gfid); + goto err; + } + if (glfs_ftruncate_with_xdata(fd, ri->rec.offset, dict) == -1) { + nsr_worker_log(this->name, GF_LOG_ERROR, + "trunctae for file %s failed @offset %d\n", + ri->rec.gfid,ri->rec.offset ); + goto err; + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "Finished DOing truncate for gfid %s @offset %d \n", + ri->rec.gfid, ri->rec.offset); + + } else if ((ri->rec.op == GF_FOP_FREMOVEXATTR) || + (ri->rec.op == GF_FOP_REMOVEXATTR) || + (ri->rec.op == GF_FOP_SETXATTR) || + (ri->rec.op == GF_FOP_FSETXATTR)) { + + uint32_t k_s = 0, v_s = 0; + char *t_b= NULL; + uint32_t num = 0; + + nsr_worker_log(this->name, GF_LOG_INFO, + "Doing set extended attr for file %s \n", + ri->rec.gfid); + + // The file has got deleted on the source. Hence just ignore + // this. TBD - get a way to just stuff the log entry without + // writing the data so that changelogs remain identical. + if (ri->work.data == NULL) { + return _gf_true; + } + + if ((obj = create_obj(ctx, ri->rec.gfid)) == NULL) { + goto err; + } + + if (obj->inode->ia_type == IA_IFDIR) + fd = glfs_h_opendir_with_xdata(ctx->fs, obj, dict); + else + fd = glfs_h_open_with_xdata(ctx->fs, obj, O_RDWR, dict); + if (fd == NULL) { + nsr_worker_log(this->name, GF_LOG_ERROR, + "open for file %s failed\n", + ri->rec.gfid); + goto err; + } + + if(get_xattr_total_size(fd, &t_b, &k_s, &v_s, &num, dict) == -1) { + if (t_b) free(t_b); + nsr_worker_log(this->name, GF_LOG_ERROR, + "list of xattr of %s failed\n", ri->rec.gfid); + goto err; + } + + if (delete_xattr(fd, dict, t_b, num) == -1) { + nsr_worker_log(this->name, GF_LOG_ERROR, + "deleting xattrs failed\n"); + goto err; + } + + // Set one special dict flag to indicate the opcode so that + // the opcode gets set to this + if (dict_set_int32(dict,"recon-xattr-opcode",ri->rec.op)) { + nsr_worker_log(this->name, GF_LOG_ERROR, + "setting opcode to %d failed\n",ri->rec.op); + goto err; + } + + if (fill_xattr(fd, dict, ri->work.data, ri->work.num) == -1) { + nsr_worker_log(this->name, GF_LOG_ERROR, + "filling xattrs failed\n"); + goto err; + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "Finsihed Doing set extended attr for %s \n", + ri->rec.gfid); + + } else if (ri->rec.op == GF_FOP_CREATE) { + + uuid_t gfid; + + nsr_worker_log(this->name, GF_LOG_INFO, + "Doing create for file %s \n", + ri->rec.gfid); + + // TBD - add mode and flags later + uuid_parse(ri->rec.gfid, gfid); + if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) { + goto err; + } + + nsr_worker_log (this->name, GF_LOG_INFO, + "creating with mode 0%o", ri->rec.mode); + if (glfs_h_creat_with_xdata(ctx->fs, obj, ri->rec.entry, O_RDWR, ri->rec.mode, NULL, gfid, dict) == NULL) { + nsr_worker_log(this->name, GF_LOG_ERROR, + "Failure for Doing create for file %s\n", + ri->rec.entry); + goto err; + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "Finished Doing create for file %s \n", + ri->rec.entry); + + } else if (ri->rec.op == GF_FOP_MKNOD) { + + uuid_t gfid; + + nsr_worker_log(this->name, GF_LOG_INFO, + "Doing mknod for file %s \n", + ri->rec.entry); + + // TBD - add mode and flags later + uuid_parse(ri->rec.gfid, gfid); + if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) { + goto err; + } + + if (glfs_h_mknod_with_xdata(ctx->fs, obj, ri->rec.entry, O_RDWR, 0777, NULL, gfid, dict) == NULL) { + nsr_worker_log(this->name, GF_LOG_ERROR, + "Failure for Doing mknod for file %s\n", + ri->rec.entry); + goto err; + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "Finished Doing mknod for file %s \n", + ri->rec.entry); + + } else if (ri->rec.op == GF_FOP_MKDIR) { + + uuid_t gfid; + + nsr_worker_log(this->name, GF_LOG_INFO, + "Doing mkdir for dir %s \n", + ri->rec.gfid); + + // TBD - add mode and flags later + uuid_parse(ri->rec.gfid, gfid); + if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) { + goto err; + } + + if (glfs_h_mkdir_with_xdata(ctx->fs, obj, ri->rec.entry, 0777, NULL, gfid, dict) != 0) { + nsr_worker_log(this->name, GF_LOG_ERROR, + "Failure for Doing mkdir for file %s\n", + ri->rec.entry); + goto err; + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "Finished Doing mkdir for file %s \n", + ri->rec.entry); + + } else if ((ri->rec.op == GF_FOP_RMDIR) || (ri->rec.op == GF_FOP_UNLINK)) { + + nsr_worker_log(this->name, GF_LOG_INFO, + "Doing rmdir/ublink for dir %s \n", + ri->rec.entry); + + if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) { + goto err; + } + if (glfs_h_unlink_with_xdata(ctx->fs, obj, ri->rec.entry, dict) != 0) { + nsr_worker_log(this->name, GF_LOG_ERROR, + "Failure for Doing rmdir/unlink for file %s\n", + ri->rec.entry); + goto err; + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "Finished Doing rmdir/unlink for file %s \n", + ri->rec.entry); + + } else if (ri->rec.op == GF_FOP_SYMLINK) { + + uuid_t gfid; + + nsr_worker_log(this->name, GF_LOG_INFO, + "Doing symlink for file %s to file %s \n", + ri->rec.entry, ri->rec.link_path); + + if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) { + goto err; + } + uuid_parse(ri->rec.gfid, gfid); + + if (glfs_h_symlink_with_xdata(ctx->fs, obj, ri->rec.entry, ri->rec.link_path, NULL, gfid, dict) == NULL) { + nsr_worker_log(this->name, GF_LOG_ERROR, + "Failed to Doing symlink for file %s to file %s \n", + ri->rec.entry, ri->rec.link_path); + goto err; + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "Finished Doing symlink for file %s to file %s \n", + ri->rec.entry, ri->rec.link_path); + + } else if (ri->rec.op == GF_FOP_LINK) { + + nsr_worker_log(this->name, GF_LOG_INFO, + "Doing hard link for file %s to file %s \n", + ri->rec.entry, ri->rec.gfid); + + if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) { + goto err; + } + if ((to_obj = create_obj(ctx, ri->rec.gfid)) == NULL) { + goto err; + } + + if (glfs_h_link_with_xdata(ctx->fs, to_obj, obj, ri->rec.entry, dict) == -1) { + nsr_worker_log(this->name, GF_LOG_ERROR, + "Failed to Doing hard link for file %s to file %s \n", + ri->rec.entry, ri->rec.gfid); + goto err; + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "Finsihed doing hard link for file %s to file %s \n", + ri->rec.entry, ri->rec.gfid); + + } else if (ri->rec.op == GF_FOP_RENAME) { + + nsr_worker_log(this->name, GF_LOG_INFO, + "Doing rename for file %s to file %s \n", + ri->rec.entry, ri->rec.newloc); + + if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) { + goto err; + } + if ((to_obj = create_obj(ctx, ri->rec.gfid)) == NULL) { + goto err; + } + + if (glfs_h_rename_with_xdata(ctx->fs, obj, ri->rec.entry, to_obj, ri->rec.newloc, dict) == -1) { + nsr_worker_log(this->name, GF_LOG_ERROR, + "Failed to Doing rename for file %s to file %s \n", + ri->rec.entry, ri->rec.newloc); + goto err; + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "Finsihed doing renam for file %s to file %s \n", + ri->rec.entry, ri->rec.newloc); + + + } else if ((ri->rec.op == GF_FOP_SETATTR) || (ri->rec.op == GF_FOP_FSETATTR)) { + + struct iatt iatt = {0, }; + int valid = 0; + int ret = -1; + + // TBD - do the actual settings once we do that + // right now we just set the mode so that changelog gets filled + + nsr_worker_log(this->name, GF_LOG_INFO, + "Doing attr for file %s \n", + ri->rec.gfid); + + if ((obj = create_obj(ctx, ri->rec.gfid)) == NULL) { + goto err; + } + + if (obj->inode->ia_type == IA_IFDIR) + fd = glfs_h_opendir_with_xdata(ctx->fs, obj, dict); + else + fd = glfs_h_open_with_xdata(ctx->fs, obj, O_RDWR, dict); + if (fd == NULL) { + nsr_worker_log(this->name, GF_LOG_ERROR, + "open for file %s failed\n", + ri->rec.gfid); + goto err; + } + + iatt.ia_prot = ia_prot_from_st_mode(777); + valid = GF_SET_ATTR_MODE; + + + // Set one special dict flag to indicate the opcode so that + // the opcode gets set to this + if (dict_set_int32(dict,"recon-attr-opcode",ri->rec.op)) { + nsr_worker_log(this->name, GF_LOG_ERROR, + "setting opcode to %d failed\n",ri->rec.op); + goto err; + } + + ret = glfs_fsetattr_with_xdata(fd, &iatt, valid, dict); + if (ret == -1) { + nsr_worker_log(this->name, GF_LOG_INFO, + "failed Doing attr for file %s \n", + ri->rec.gfid); + goto err; + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "Doing attr for file %s \n", + ri->rec.gfid); + + } + + retval = _gf_true; + +err: + if (fd) { + /* + * It's not clear that we should be passing the same dict to + * glfs_close that was passed to us for glfs_open, but that's + * the prior behavior so let's preserve it for now. + */ + if (glfs_close_with_xdata(fd, dict) == -1) { + nsr_worker_log(this->name, GF_LOG_ERROR, + "close failed\n"); + } + } + if (obj) { + /* + * AFAICT fd operations do not borrow this reference, so we + * still need to drop it ourselves. + */ + glfs_h_close(obj); + } + if (to_obj) { + /* + * AFAICT fd operations do not borrow this reference, so we + * still need to drop it ourselves. + */ + glfs_h_close(to_obj); + } + return retval; +} + +//return back opcodes that requires reading from source +static gf_boolean_t +recon_check_changelog(nsr_recon_record_details_t *rd) +{ + return((rd->op == GF_FOP_WRITE) || + (rd->op == GF_FOP_FSETATTR) || + (rd-> op == GF_FOP_SETATTR) || + (rd->op == GF_FOP_FREMOVEXATTR) || + (rd->op == GF_FOP_SETXATTR) || + (rd->op == GF_FOP_FSETXATTR) || + (rd->op == GF_FOP_SYMLINK)); + +} + +// TBD +static gf_boolean_t +recon_compute_undo(nsr_recon_record_details_t *rd) +{ + return(_gf_false); +} + + +/* + * Function that talks to the brick for data tranfer. + * + * Input arguments: + * ctx - worker context + * work - pointer to work object + */ +static void +data_worker_func(nsr_per_node_worker_t *ctx, + nsr_recon_work_t *work) +{ + nsr_recon_driver_ctx_t *dr = ctx->driver_ctx; + xlator_t *this = dr->this; + nsr_reconciliation_record_t *ri = NULL; + nsr_recon_record_details_t *rd = NULL; + int wip = 0; + dict_t * dict = NULL; + struct glfs_fd *fd = NULL; + struct glfs_object *obj = NULL; + uuid_t gfid; + uint32_t k_s = 0, v_s = 0; + char *t_b= NULL; + uint32_t num=0; + + switch (work->req_id){ + case NSR_WORK_ID_INI: + nsr_worker_log(this->name, GF_LOG_INFO, + "started data ini \n"); + + if (nsr_recon_start_work(ctx, _gf_false) != 0) { + ctx->result = -1; + return; + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "finished data ini \n"); + break; + case NSR_WORK_ID_FINI: + nsr_worker_log(this->name, GF_LOG_INFO, + "started data fini \n"); + + if (nsr_recon_end_work(ctx, _gf_false) != 0) { + ctx->result = -1; + return; + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "finished data fini \n"); + break; + case NSR_WORK_ID_SINGLE_RECONCILIATION_READ: + // first_index always starts with 1 but records starts at 0. + wip = work->index - (dr->workers[0].recon_info->first_index); + ri = &(dr->workers[0].recon_info->records[wip]); + rd = &(ri->rec); + + dict = dict_new (); + if (!dict) { + ctx->result = -1; + nsr_worker_log(this->name, GF_LOG_ERROR, + "failed allocating for dictionary\n"); + break; + } + if (dict_set_int32(dict,RECON_TERM_XATTR,ri->work.term)) { + ctx->result = -1; + nsr_worker_log(this->name, GF_LOG_ERROR, + "error setting term in dict\n"); + break; + } + if (dict_set_int32(dict,RECON_INDEX_XATTR,ri->work.index)) { + ctx->result = -1; + nsr_worker_log(this->name, GF_LOG_ERROR, + "error setting term in dict\n"); + break; + } + + switch (rd->op) { + case GF_FOP_WRITE: + + // record already copied. + // copy data to this node's info. + + uuid_parse(ri->rec.gfid, gfid); + + nsr_worker_log(this->name, GF_LOG_INFO, + "started recon read for file %s at offset %d at len %d\n", + ri->rec.gfid, rd->offset, rd->len); + + obj = glfs_h_create_from_handle (ctx->fs, gfid, + GFAPI_HANDLE_LENGTH, + NULL); + if (obj == NULL) { + GF_ASSERT(obj != NULL); + nsr_worker_log(this->name, GF_LOG_ERROR, + "creating of handle failed\n"); + break; + } + + // The file has probably got deleted. + fd = glfs_h_open_with_xdata (ctx->fs, obj, O_RDONLY, + dict); + if (fd == NULL) { + GF_ASSERT(fd != NULL); + nsr_worker_log(this->name, GF_LOG_ERROR, + "opening of file failed\n"); + break; + } + + if (glfs_lseek_with_xdata (fd, rd->offset, SEEK_SET, + dict) != rd->offset) { + nsr_worker_log(this->name, GF_LOG_ERROR, + "lseek of file failed to offset %d\n", + rd->offset); + break; + } + + ri->work.data = RD_CALLOC (rd->len , sizeof(char), + gf_mt_recon_work_data_t); + if (glfs_read_with_xdata (fd, ri->work.data, rd->len, + 0, dict) != rd->len) { + nsr_worker_log(this->name, GF_LOG_ERROR, + "read of file failed to offset %d for bytes %d\n", + rd->offset, rd->len); + break; + } + break; + + case GF_FOP_FTRUNCATE: + case GF_FOP_SYMLINK: + case GF_FOP_RMDIR: + case GF_FOP_UNLINK: + case GF_FOP_MKNOD: + case GF_FOP_CREATE: + case GF_FOP_LINK: + case GF_FOP_MKDIR: + case GF_FOP_RENAME: + nsr_worker_log (this->name, GF_LOG_ERROR, + "unimplemented fop %u\n", rd->op); + break; + + case GF_FOP_FREMOVEXATTR: + case GF_FOP_REMOVEXATTR: + case GF_FOP_SETXATTR: + case GF_FOP_FSETXATTR: + + uuid_parse(ri->rec.gfid, gfid); + + + // This is for all the set attribute/extended + // attributes commands. Get all the attributes from + // the source and fill it in the buffer as a NULL + // seperated key and value which are in turn seperated + // by NULL. + + nsr_worker_log(this->name, GF_LOG_INFO, + "doing getattr for gfid %s \n", + ri->rec.gfid); + + obj = glfs_h_create_from_handle (ctx->fs, gfid, + GFAPI_HANDLE_LENGTH, + NULL); + if (obj == NULL) { + GF_ASSERT(fd != NULL); + nsr_worker_log(this->name, GF_LOG_ERROR, + "creating of handle failed\n"); + break; + } + + if (obj->inode->ia_type == IA_IFDIR) + fd = glfs_h_opendir_with_xdata (ctx->fs, obj, + dict); + else + fd = glfs_h_open_with_xdata (ctx->fs, obj, + O_RDONLY, dict); + + if (fd == NULL) { + GF_ASSERT(fd != NULL); + nsr_worker_log(this->name, GF_LOG_ERROR, + "opening of file failed\n"); + break; + } + + if (get_xattr_total_size (fd, &t_b, &k_s, &v_s, &num, + dict) == -1) { + if (t_b) free(t_b); + nsr_worker_log(this->name, GF_LOG_ERROR, + "list of xattr of gfid %s failed\n", + rd->gfid); + break; + } + ri->work.data = RD_CALLOC ((k_s + v_s) , sizeof(char), + gf_mt_recon_work_data_t); + if (get_xattr(fd, t_b, ri->work.data, v_s, num, dict) == -1) { + nsr_worker_log(this->name, GF_LOG_ERROR, + "get xattr of gfid %s failed\n", rd->gfid); + break; + } + ri->work.num = num; + nsr_worker_log(this->name, GF_LOG_INFO, + "finished getattr for gfid %s \n", + ri->rec.gfid); + free(t_b); + break; + + case GF_FOP_FSETATTR: + case GF_FOP_SETATTR: + //TBD - to get the actual attrbutes and fill + // mode, uid, gid, size, atime, mtime + nsr_worker_log (this->name, GF_LOG_ERROR, + "unimplemented fop %u\n", rd->op); + break; + default: + nsr_worker_log (this->name, GF_LOG_ERROR, + "unrecognized fop %u\n", rd->op); + + } + nsr_worker_log(this->name, GF_LOG_INFO, + "finished recon read for gfid %s at offset %d for %d bytes \n", + rd->gfid, rd->offset, rd->len); + break; + + case NSR_WORK_ID_SINGLE_RECONCILIATION_COMMIT: + // first_index always starts with 1 but records starts at 0. + wip = work->index - (dr->workers[0].recon_info->first_index); + ri = &(dr->workers[0].recon_info->records[wip]); + rd = &(ri->rec); + + nsr_worker_log(this->name, GF_LOG_INFO, + "got recon commit for index %d that has gfid %s \n", + wip, rd->gfid); + dict = dict_new (); + if (!dict) { + ctx->result = -1; + nsr_worker_log(this->name, GF_LOG_ERROR, + "failed allocating for dictionary\n"); + break; + } + if (dict_set_int32(dict,RECON_TERM_XATTR,ri->work.term)) { + ctx->result = -1; + nsr_worker_log(this->name, GF_LOG_ERROR, + "error setting term in dict\n"); + break; + } + if (dict_set_int32(dict,RECON_INDEX_XATTR,ri->work.index)) { + ctx->result = -1; + nsr_worker_log(this->name, GF_LOG_ERROR, + "error setting term in dict\n"); + break; + } + if (apply_record(ctx, ri, dict) == _gf_false) { + nsr_worker_log(this->name, GF_LOG_ERROR, + "apply_record fails\n"); + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "finished recon commit for gfid %s \n", + rd->gfid); + break; + + case NSR_WORK_ID_SINGLE_RECONCILIATION_FLUSH: + dict = dict_new (); + if (!dict) { + ctx->result = -1; + nsr_worker_log(this->name, GF_LOG_ERROR, + "failed allocating for dictionary\n"); + break; + } + if (dict_set_int32(dict,RECON_TERM_XATTR,ri->work.term)) { + ctx->result = -1; + nsr_worker_log(this->name, GF_LOG_ERROR, + "error setting term in dict\n"); + break; + } + if (dict_set_int32(dict,RECON_INDEX_XATTR,ri->work.index)) { + ctx->result = -1; + nsr_worker_log(this->name, GF_LOG_ERROR, + "error setting term in dict\n"); + break; + } + + // Increment work index with the start index + wip = work->index - (dr->workers[0].recon_info->first_index); + ri = &(dr->workers[0].recon_info->records[wip]); + rd = &(ri->rec); + + glfs_fsync_with_xdata(fd, dict); + break; + + default: + nsr_worker_log (this->name, GF_LOG_ERROR, + "unrecognized request id %u\n", work->req_id); + } + + if (fd) { + glfs_close_with_xdata(fd, dict); + } + if (obj) { + glfs_h_close(obj); + } + if (dict) { + dict_unref(dict); + } +} + +// thread for doing data work +static void * +data_worker_main(nsr_per_node_worker_t *ctx) +{ + nsr_worker_log(this->name, GF_LOG_INFO, + "starting data worker func\n"); + init_worker(ctx, 0); + + while(1) { + nsr_recon_work_t *work = NULL; + nsr_recon_driver_ctx_t *dr = ctx->driver_ctx; + + nsr_worker_log(this->name, GF_LOG_INFO, + "waiting for work\n"); + + pthread_mutex_lock(&(ctx->mutex)); + while (list_empty(&(ctx->head.list))) { + pthread_cond_wait(&(ctx->cv), &(ctx->mutex)); + } + pthread_mutex_unlock(&(ctx->mutex)); + list_for_each_entry(work, &(ctx->head.list), list) { + nsr_worker_log(this->name, GF_LOG_INFO, + "got work with id %d\n",work->req_id); + work->in_use = _gf_false; + data_worker_func(ctx, work); + atomic_dec(&(dr->outstanding)); + break; + } + nsr_worker_log(this->name, GF_LOG_INFO,"deleting work item\n"); + list_del_init (&work->list); + GF_FREE(work); + nsr_worker_log(this->name, GF_LOG_INFO,"finished deleting work item\n"); + } + + return NULL; +} + + +//make recon work +static void +recon_make_work(nsr_recon_driver_ctx_t *ctx, + nsr_recon_work_t **work, + nsr_recon_work_req_id_t req_id, + int32_t i) +{ + xlator_t *this = ctx->this; + + // TBD - change this to get from a static pool + // This cannot fail + (*work) = RD_CALLOC (1, sizeof (nsr_recon_work_t), gf_mt_recon_work_t); + (*work)->req_id = req_id; + (*work)->index = i; + (*work)->in_use = _gf_true; + INIT_LIST_HEAD(&((*work)->list)); + return; +} + +// Schedule a work object to a worker thread. +static void +recon_queue_to_worker(nsr_recon_driver_ctx_t *ctx, + nsr_recon_work_t *work, + unsigned int id, + nsr_recon_queue_type_t type) +{ + nsr_per_node_worker_t *worker; + if (type == NSR_RECON_QUEUE_TO_CONTROL) { + worker = ctx->workers[id].control_worker; + nsr_driver_log(this->name, GF_LOG_INFO, + "queueing work to control index %d\n",id); + } else { + worker= ctx->workers[id].data_worker; + nsr_driver_log(this->name, GF_LOG_INFO, + "queueing work to data index %d\n",id); + } + pthread_mutex_lock(&worker->mutex); + list_add_tail(&work->list, &worker->head.list); + pthread_cond_signal(&worker->cv); + pthread_mutex_unlock(&worker->mutex); + return; +} + +typedef void * (*F_t) (void *); + +// In case mode is set to NSR_USE_THREADS, create worker threads. +static gf_boolean_t +create_worker_threads(nsr_recon_private_t *priv, + nsr_recon_driver_ctx_t *ctx, + nsr_per_node_worker_t *w, + gf_boolean_t control_or_data, + F_t f, + uint32_t num) +{ + uint32_t i; + nsr_per_node_worker_t *worker = w; + xlator_t *this = ctx->this; + + for (i=0; i < num; i++) { + worker->id = RD_CALLOC(1, 10, gf_mt_recon_id_t); + if (!worker->id) { + nsr_driver_log (priv->this->name, GF_LOG_ERROR, "memory allocation error \n"); + return _gf_false; + } + sprintf(worker->id,"recon_%d", i); + worker->driver_ctx = ctx ; + + if (ctx->mode == NSR_USE_THREADS) { + if (pthread_create(&worker->thread_id, NULL, f, worker)) { + nsr_driver_log (ctx->this->name, GF_LOG_ERROR, "control work thread creation error \n"); + return _gf_false; + } + } + worker->index = i; + worker++; + } + return _gf_true; +} + +/* + * In case of thread, send the work item; else call the function directly. + * + * Input arguments: + * bm - bitmap containing indices of nodes we want to send work + * num - number of such indices + * ctx - driver context from where we derive per worker context + * id - request ID + * q - control or data + * misc - used to overload such as index. + */ +static void +send_and_wait(int32_t *result, + int32_t *op_errno, + int32_t bm, + uint32_t num, + nsr_recon_driver_ctx_t *ctx, + nsr_recon_work_req_id_t id, + nsr_recon_queue_type_t q, + int32_t misc) +{ + uint32_t i = 0; + nsr_recon_work_t *work; + +#define CONTROL_WORKER(i) ctx->workers[i].control_worker +#define DATA_WORKER(i) ctx->workers[i].data_worker +#define WORKER(i) ((q == NSR_RECON_QUEUE_TO_CONTROL) ? (CONTROL_WORKER(i)) : (DATA_WORKER(i))) + + *result = *op_errno = 0; + + for (i=0; i < num; i++) { + if ((bm & (1 << i)) && ctx->workers[i].in_use) { + WORKER(i)->result = 0; + WORKER(i)->op_errno = 0; + } + } + if (ctx->mode == NSR_SEQ) { + for (i=0; i < num; i++) { + if ((bm & (1 << i)) && ctx->workers[i].in_use) { + recon_make_work(ctx,&work, id, misc); + if (q == NSR_RECON_QUEUE_TO_CONTROL) { + if (i == 0) + control_worker_func_0(ctx->workers[0].control_worker, work); + else + control_worker_func(ctx->workers[i].control_worker, work); + } else { + data_worker_func(ctx->workers[i].data_worker, work); + } + GF_FREE(work); + } + } + goto out; + } + + for (i=0; i < num; i++) { + if ((bm & (1 << i)) && ctx->workers[i].in_use) { + recon_make_work(ctx,&work, id, misc); + atomic_inc(&(ctx->outstanding)); + recon_queue_to_worker(ctx, work, i, q); + } + } + + nsr_driver_log(this->name, GF_LOG_INFO, "send_and_wait: waiting\n"); + while (ctx->outstanding) { + pthread_yield(); + } +out: + for (i=0; i < num; i++) { + if ((bm & (1 << i)) && ctx->workers[i].in_use) { + if (WORKER(i)->result == -1) { + *result = -1; + } + } + } + if (*result == -1) { + for (i=0; i < num; i++) { + if ((bm & (1 << i)) && ctx->workers[i].in_use) { + if (WORKER(i)->op_errno == EAGAIN) { + *op_errno = EAGAIN; + break; + } else { + *op_errno = EIO; + } + } + } + } + + nsr_driver_log(this->name, GF_LOG_INFO, "send_and_wait: all workers have returned with result: %d errno:%d\n", *result, *op_errno); + return; +} + +// send INI or FINI +static int32_t +nsr_recon_in_use(nsr_recon_driver_ctx_t *ctx, + uint32_t i, + gf_boolean_t in_use) +{ + uint32_t bm = 1 << i; + gf_boolean_t send = _gf_false; + int32_t status =0, op_errno = 0; + + send = (ctx->workers[i].in_use != in_use); + ctx->workers[i].in_use = in_use; + + if (!send) { + return 0; + } + nsr_driver_log (this->name, GF_LOG_INFO, + "sending %s to index %d\n",in_use?"INI":"FINI",i); + + send_and_wait(&status, &op_errno, bm, ctx->replica_group_size, ctx, + (in_use == _gf_true) ? NSR_WORK_ID_INI : NSR_WORK_ID_FINI, + NSR_RECON_QUEUE_TO_CONTROL, -1); + if (status == -1) + goto err; + + send_and_wait(&status, &op_errno, bm, ctx->replica_group_size, ctx, + (in_use == _gf_true) ? NSR_WORK_ID_INI : NSR_WORK_ID_FINI, + NSR_RECON_QUEUE_TO_DATA, -1); + if (status == -1) + goto err; + + /* + * We really need better error recovery. To activate a worker, we + * allocate memory and send two messages. If any of those actions + * fail, we should undo the others. It would probably be good to + * collapse the two messages into one, because it's pretty trivial to + * allocate a temporary structure and either link it in or free it + * depending on the result here. + */ + + if (in_use == _gf_false) { + GF_FREE(ctx->workers[i].recon_info); + } + + return 0; + +err: + GF_FREE(ctx->workers[i].recon_info); + ctx->workers[i].recon_info = NULL; + return -1; +} + +gf_boolean_t +nsr_recon_driver_reconciliator (nsr_recon_private_t *priv) +{ + uint32_t replica_group_size = priv->replica_group_size; + uint32_t i; + nsr_recon_driver_ctx_t *ctx = priv->driver_thread_context; + int32_t bm; + int32_t status = 0; + int32_t op_errno = 0; + gf_boolean_t do_recon = _gf_false; + uint32_t start_index = ctx->workers[0].recon_info->first_index; + uint32_t end_index = ctx->workers[0].recon_info->last_index; + uint32_t num = ((start_index == 0) && (end_index == 0)) ? 0 : (end_index - start_index + 1); + + nsr_driver_log (this->name, GF_LOG_INFO, + "starting reconciliation work as reconciliator \n"); + + // nothing to be done? signal back to the recon translator that this + // phase done. + bm = 1; + for (i=1; i < replica_group_size; i++) { + if (ctx->workers[i].in_use && + (ctx->workers[0].recon_info->last_term == ctx->workers[i].recon_info->last_term)) { + ctx->workers[i].recon_info->last_index = end_index; + ctx->workers[i].recon_info->first_index = start_index; + bm |= (1 << i); + do_recon = _gf_true; + } + } + + if (!do_recon || !num) { + nsr_driver_log (this->name, GF_LOG_INFO, + "nothing needs to be done as resolutor \n"); + return _gf_true; + } + + nsr_driver_log (this->name, GF_LOG_INFO, + "getting reconciliation window for term %d from %dto %d \n", + ctx->workers[0].recon_info->last_term, + start_index, end_index); + // We have set the bm in the above for loop where we go thru all nodes + // including this node that have seen the last term. + send_and_wait(&status, &op_errno, bm, + replica_group_size, + ctx, + NSR_WORK_ID_GET_RECONCILATION_WINDOW, + NSR_RECON_QUEUE_TO_CONTROL, -1); + if (status == -1) + return _gf_false; + nsr_driver_log (this->name, GF_LOG_INFO, + "finished getting reconciliation window for term %d from %dto %d \n", + ctx->workers[0].recon_info->last_term, + start_index, end_index); + + + // from the changelogs, calculate the entries that need action and the + // source for each of these entries + compute_reconciliation_work(ctx); + + // for each of the entries that need fixup, issue IO + for (i=start_index; i < (start_index + num); i++) { + nsr_reconciliator_info_t *my_recon_info = + ctx->workers[0].recon_info; + nsr_reconciliation_record_t *record = + &(my_recon_info->records[i - start_index]); + + record->work.term = ctx->workers[0].recon_info->last_term; + record->work.index = i; + + nsr_driver_log (this->name, GF_LOG_INFO, + "fixing index %d\n",i); + if ((record->work.type == NSR_RECON_WORK_HOLE_TO_PSEUDO_HOLE) || + (record->work.type == NSR_RECON_WORK_HOLE_TO_FILL)) { + // 1st case (RECON_WORK_HOLE_TO_PSEUDO_HOLE): If there + // are only pseudo_holes in others, it is best effort. + // Just pick from the first node that has it and + // proceed. + // 2nd case (RECON_WORK_HOLE_TO_FILL): this node has + // either a HOLE or PSUEDO_HOLE and some one else has a + // FILL(source). analyse the changelog to check if data + // needs to be read or if the log has all the data + // required + + if (recon_check_changelog(&record->rec)) { + bm = (1 << record->work.source); + nsr_driver_log (this->name, GF_LOG_INFO, + "reading data from source %d\n",record->work.source); + send_and_wait(&status, &op_errno, bm, + replica_group_size, + ctx, + NSR_WORK_ID_SINGLE_RECONCILIATION_READ, + NSR_RECON_QUEUE_TO_DATA, + i); + if (status == -1) + return _gf_false; + nsr_driver_log (this->name, GF_LOG_INFO, + "got data from source %d\n",record->work.source); + } + + nsr_driver_log (this->name, GF_LOG_INFO, + "fixing local data as part of reconciliation\n"); + + bm = 1; + send_and_wait(&status, &op_errno, bm, + replica_group_size, + ctx, + NSR_WORK_ID_SINGLE_RECONCILIATION_COMMIT, + NSR_RECON_QUEUE_TO_DATA, + i); + if (status == -1) + return _gf_false; + + nsr_driver_log (this->name, GF_LOG_INFO, + "finished fixing local data as part of reconciliation\n"); + + } else if (record->work.type == NSR_RECON_WORK_COMPARE_PSEUDO_HOLE) { + // this node has a pseudo_hole and some others have just + // that too. Just convert this to FILL. let others + // blindly pick it from here. + nsr_driver_log (this->name, GF_LOG_INFO, + "fixing this record as a fill\n"); + bm = 1; + send_and_wait(&status, &op_errno, bm, + replica_group_size, + ctx, + NSR_WORK_ID_SINGLE_RECONCILIATION_FLUSH, + NSR_RECON_QUEUE_TO_DATA, + i); + if (status == -1) + return _gf_false; + nsr_driver_log (this->name, GF_LOG_INFO, + "finished fixing this record as a fill\n"); + } + } + + nsr_driver_log (this->name, GF_LOG_INFO, + "finished reconciliation work as reconciliator \n"); + + // tbd - mark this term golden in the reconciliator + return _gf_true; +} + +gf_boolean_t +nsr_recon_driver_resolutor (nsr_recon_private_t *priv) +{ + uint32_t replica_group_size = priv->replica_group_size; + uint32_t i; + nsr_recon_driver_ctx_t *ctx = priv->driver_thread_context; + int32_t bm; + int32_t status = 0; + int32_t op_errno = 0; + // This node's last term is filled when it gets a message from the + // leader to act as a reconciliator. + uint32_t recon_index = ctx->reconciliator_index; + nsr_reconciliator_info_t *my_info = + ctx->workers[0].recon_info; + nsr_reconciliator_info_t *his_info = + ctx->workers[recon_index].recon_info; + uint32_t my_last_term = my_info->last_term; + uint32_t to_do_term = his_info->last_term; + uint32_t my_start_index = 1, my_end_index = 1; + uint32_t his_start_index = 1, his_end_index = 1; + uint32_t num = 0; + gf_boolean_t fl = _gf_true; + + nsr_driver_log (this->name, GF_LOG_INFO, + "starting resolutor work with reconciliator as %d from term %d to term %d \n", + recon_index, my_last_term, to_do_term); + + do { + + if (!fl) { + (his_info->last_term)++; + (my_info->last_term)++; + } else { + his_info->last_term = my_last_term; + } + + nsr_driver_log (this->name, GF_LOG_INFO, "resolving term %d \n", my_info->last_term); + + // Get reconciliator's term information for that term + nsr_driver_log (this->name, GF_LOG_INFO, + "getting info from reconciliator for term %d \n", my_info->last_term); + bm = (1 << recon_index); + send_and_wait(&status, &op_errno, bm, + replica_group_size, + ctx, + NSR_WORK_ID_GET_GIVEN_TERM_INFO, + NSR_RECON_QUEUE_TO_CONTROL, his_info->last_term); + if (status == -1) + return _gf_false; + nsr_driver_log (this->name, GF_LOG_INFO, + "finished getting info from reconciliator for term %d \n", my_info->last_term); + + + // empty term + if (!his_info->commited_ops) { + nsr_driver_log (this->name, GF_LOG_INFO, + "reconciliator for term %d is empty. moving to next term. \n", my_info->last_term); + // TBD - mark the term golden + fl = _gf_false; + continue; + } + + // calculate the resolution window boundary. for the last term + // this node saw, we compare the resolution window of this and + // reconciliator. for the rest of the nodes, we just accept the + // reconciliator info. + if (fl) { + my_start_index = my_info->first_index; + my_end_index = my_info->last_index; + his_start_index = his_info->first_index; + his_end_index = his_info->last_index; + my_info->first_index = (my_start_index < his_start_index) ? my_start_index : his_start_index; + my_info->last_index = (my_end_index > his_end_index) ? my_end_index : his_end_index; + } else { + my_info->first_index = his_info->first_index; + my_info->last_index = his_info->last_index; + my_info->commited_ops = his_info->commited_ops; + } + if (my_info->first_index == 0) + my_info->first_index = 1; + num = (my_info->last_index - my_info->first_index) + 1; + + + // Get the logs from the reconciliator (and this node for this + // term) + if (fl) + bm = ((1 << recon_index) | 1); + else + bm = (1 << recon_index); + + nsr_driver_log (this->name, GF_LOG_INFO, + "getting reconciliation window for term %d from %d to %d \n", + my_info->last_term, + my_info->first_index, my_info->last_index); + send_and_wait(&status, &op_errno, bm, + replica_group_size, + ctx, + NSR_WORK_ID_GET_RECONCILATION_WINDOW, + NSR_RECON_QUEUE_TO_CONTROL, -1); + if (status == -1) + return _gf_false; + nsr_driver_log (this->name, GF_LOG_INFO, + "finished getting reconciliation window for term %d from %d to %d \n", + my_info->last_term, + my_info->first_index, my_info->last_index); + + // from the changelogs, calculate the entries that need action + compute_resolution_work(ctx, my_info, his_info, !fl); + + + // for each of the entries that need fixup, issue IO + for (i=my_info->first_index; i < (my_info->first_index + num); i++) { + nsr_reconciliation_record_t *record = + &(my_info->records[i - my_info->first_index]); + + record->work.term = my_info->last_term; + record->work.index = i; + + nsr_driver_log (this->name, GF_LOG_INFO, + "fixing index %d\n",i); + if ((record->work.type == NSR_RECON_WORK_HOLE_TO_FILL) || + (record->work.type == NSR_RECON_WORK_UNDO_FILL)) { + if (((record->work.type == NSR_RECON_WORK_HOLE_TO_FILL) && + recon_check_changelog(&record->rec)) || + ((record->work.type == NSR_RECON_WORK_UNDO_FILL) && + recon_compute_undo(&record->rec))) { + nsr_driver_log (this->name, GF_LOG_INFO, + "reading data from source %d\n",recon_index); + bm = (1 << recon_index); + send_and_wait(&status, &op_errno, bm, + replica_group_size, + ctx, + NSR_WORK_ID_SINGLE_RECONCILIATION_READ, + NSR_RECON_QUEUE_TO_DATA, + i); + if (status == -1) + return _gf_false; + nsr_driver_log (this->name, GF_LOG_INFO, + "finished reading data from source %d\n",recon_index); + } + + nsr_driver_log (this->name, GF_LOG_INFO, + "fixing local data as part of resolutor\n"); + + bm = 1; + send_and_wait(&status, &op_errno, bm, + replica_group_size, + ctx, + NSR_WORK_ID_SINGLE_RECONCILIATION_COMMIT, + NSR_RECON_QUEUE_TO_DATA, + i); + if (status == -1) + return _gf_false; + + nsr_driver_log (this->name, GF_LOG_INFO, + "finished fixing local data as part of resolutor\n"); + } + } + fl = _gf_false; + + // tbd - mark this term golden in the reconciliator + } while (my_last_term++ != to_do_term); + + nsr_driver_log (this->name, GF_LOG_INFO, + "finished resolutor work \n"); + return _gf_true; +} + +gf_boolean_t +nsr_recon_driver_leader (nsr_recon_private_t *priv) +{ + uint32_t replica_group_size = priv->replica_group_size; + uint32_t i; + nsr_recon_driver_ctx_t *ctx = priv->driver_thread_context; + int32_t bm; + int32_t status = 0; + int32_t op_errno = 0; + int32_t chosen = -1; + int32_t last_term = -1, last_ops = -1; + + nsr_driver_log (this->name, GF_LOG_INFO, + "getting last term info from all members of this group\n"); + // Get last term info from all members for this group + send_and_wait(&status, &op_errno, -1, + replica_group_size, + ctx, + NSR_WORK_ID_GET_LAST_TERM_INFO, + NSR_RECON_QUEUE_TO_CONTROL, ctx->current_term); + if (status == -1) + return _gf_false; + + + // compare all the info received and choose the reconciliator First + // choose all with latest term + for (i=0; i < replica_group_size; i++) { + if (ctx->workers[i].in_use) { + if (ctx->workers[i].recon_info->last_term > last_term) { + last_term = ctx->workers[i].recon_info->last_term; + } + } + } + // First choose all with latest term and highest ops + for (i=0; i < replica_group_size; i++) { + if ((ctx->workers[i].in_use) && (last_term == ctx->workers[i].recon_info->last_term)) { + if (ctx->workers[i].recon_info->commited_ops > last_ops) { + last_ops = ctx->workers[i].recon_info->commited_ops; + } + } + } + // choose the first among the lot + for (i=0; i < replica_group_size; i++) { + if ((ctx->workers[i].in_use) && + (last_term == ctx->workers[i].recon_info->last_term) && + (last_ops == ctx->workers[i].recon_info->commited_ops)) { + chosen = i; + break; + } + } + + nsr_driver_log (this->name, GF_LOG_INFO, + "reconciliator chosen is %d\n", chosen); + ctx->reconciliator_index = chosen; + GF_ASSERT(chosen != -1); + if (chosen == -1) { + nsr_driver_log (this->name, GF_LOG_INFO, + "no reconciliatior chosen\n"); + return _gf_false; + } + + // send the message to reconciliator to do reconciliation with list of + // nodes that are part of this quorum + if (chosen != 0) { + nsr_driver_log (this->name, GF_LOG_INFO, + "sending reconciliation work to %d\n", chosen); + bm = 1 << ctx->reconciliator_index; + send_and_wait(&status, &op_errno, bm, + replica_group_size, + ctx, + NSR_WORK_ID_RECONCILIATOR_DO_WORK, + NSR_RECON_QUEUE_TO_CONTROL, -1); + if (status == -1) + return _gf_false; + nsr_driver_log (this->name, GF_LOG_INFO, + "finished reconciliation work to %d\n", chosen); + } else { + nsr_driver_log (this->name, GF_LOG_INFO, + "local node is reconciliator. before set jmp\n"); + nsr_recon_driver_reconciliator(priv); + } + + // send message to all other nodes to sync up with the reconciliator + // including itself if required + // requires optimisation - TBD + if (chosen != 0) { + nsr_driver_log (this->name, GF_LOG_INFO, + "local node resolution needs to be done. before set jmp\n"); + nsr_recon_driver_resolutor(priv); + } + + nsr_driver_log (this->name, GF_LOG_INFO, + "sending resolution work to all nodes except this node and reconciliator\n"); + bm = ~((1 << ctx->reconciliator_index) || 1); + send_and_wait(&status, &op_errno, bm, + replica_group_size, + ctx, + NSR_WORK_ID_RESOLUTION_DO_WORK, + NSR_RECON_QUEUE_TO_CONTROL, -1); + if (status == -1) + return _gf_false; + + nsr_driver_log (this->name, GF_LOG_INFO, + "finished reconciliation work as leader \n"); + return _gf_true; +} + +// main recon driver thread +void * +nsr_reconciliation_driver(void *arg) +{ + nsr_recon_private_t *priv = (nsr_recon_private_t *) arg; + uint32_t replica_group_size = priv->replica_group_size; + uint32_t i; + nsr_per_node_worker_t *control_s, *data_s; + nsr_recon_driver_ctx_t **driver_ctx, *ctx; + int32_t bm; + xlator_t *this = priv->this; + char *con_name, *data_name; + int32_t status = 0; + int32_t op_errno = 0; + + driver_ctx = &priv->driver_thread_context; + (*driver_ctx) = GF_CALLOC (1, + sizeof (nsr_recon_driver_ctx_t), + gf_mt_recon_driver_ctx_t); + if (!driver_ctx) { + gf_log (this->name, GF_LOG_ERROR, "memory allocation error \n"); + return NULL; + } + ctx = *driver_ctx; + ctx->this = priv->this; + ctx->replica_group_size = replica_group_size; + + ctx->fp = recon_create_log (priv->replica_group_members[0], "nsr-driver-log"); + if (!ctx->fp) + return NULL; + + if ((pthread_mutex_init(&(ctx->mutex), NULL)) || + (pthread_cond_init(&(ctx->cv), NULL))){ + nsr_driver_log (this->name, GF_LOG_ERROR, "mutex init error \n"); + return NULL; + } + INIT_LIST_HEAD(&(ctx->role_head.list)); + + ctx->workers = RD_CALLOC (replica_group_size, + sizeof(nsr_replica_worker_t), + gf_mt_recon_worker_t); + if (!ctx->workers) { + nsr_driver_log (this->name, GF_LOG_ERROR, "memory allocation error \n"); + return NULL; + } + for (i=0; i < replica_group_size; i++) { + strcpy(ctx->workers[i].name, priv->replica_group_members[i]); + } + + control_s = RD_CALLOC (replica_group_size, + sizeof(nsr_per_node_worker_t), + gf_mt_recon_per_node_worker_t); + if (!control_s) { + nsr_driver_log (this->name, GF_LOG_ERROR, "memory allocation error \n"); + return NULL; + } + + data_s = RD_CALLOC (replica_group_size, + sizeof(nsr_per_node_worker_t), + gf_mt_recon_per_node_worker_t); + if (!data_s) { + nsr_driver_log (this->name, GF_LOG_ERROR, "memory allocation error \n"); + return NULL; + } + for (i=0; i < replica_group_size; i++) { + ctx->workers[i].control_worker = &control_s[i]; + if (asprintf(&con_name,"recon-con-%u",i) < 1) { + return NULL; + } + ctx->workers[i].control_worker->fp = recon_create_log + (priv->replica_group_members[0], con_name); + if (!ctx->workers[i].control_worker->fp) + return NULL; + ctx->workers[i].data_worker = &data_s[i]; + if (asprintf (&data_name,"recon-data-%u",i) <1) { + return NULL; + } + ctx->workers[i].data_worker->fp = recon_create_log + (priv->replica_group_members[0], data_name); + if (!ctx->workers[i].data_worker->fp) + return NULL; + } + + nsr_driver_log (this->name, GF_LOG_INFO, "creating threads \n"); + // Create the worker threads + // For every brick including itself there will be 2 worker threads: + // one for data and one for control + if (!create_worker_threads(priv, ctx, control_s, _gf_true, + (F_t) control_worker_main, replica_group_size) || + !create_worker_threads(priv, ctx, data_s, _gf_false, + (F_t) data_worker_main, replica_group_size)) { + return NULL; + } + + for (i=0; i < replica_group_size; i++) { + nsr_recon_get_file(priv->volname, &(ctx->workers[i])); + } + + while (1) { + + nsr_role_work_t *rr; + + nsr_driver_log (this->name, GF_LOG_INFO, "waiting for role to be queued \n"); + pthread_mutex_lock(&(ctx->mutex)); + while (list_empty(&(ctx->role_head.list))) { + pthread_cond_wait(&(ctx->cv), &(ctx->mutex)); + } + pthread_mutex_unlock(&(ctx->mutex)); + + list_for_each_entry(rr, &(ctx->role_head.list), list) { + nsr_recon_driver_state_t state; + state = nsr_recon_driver_get_role(&status, ctx, rr); + + if (status == -1) { + op_errno = EIO; + goto out; + } + + switch (state) { + + case leader: + if (!nsr_recon_driver_leader(priv)) { + goto out; + } + break; + case reconciliator: + if (!nsr_recon_driver_reconciliator(priv)) { + goto out; + } + break; + case resolutor: + if (!nsr_recon_driver_resolutor(priv)) { + goto out; + } + break; + + case joiner: + + nsr_driver_log (this->name, GF_LOG_INFO, "getting last term info from all members of this group\n"); + // Get last term info from all members for this group + // which will be the leader(this node) and the node that wants to join. + send_and_wait(&status, &op_errno, -1, + replica_group_size, + ctx, + NSR_WORK_ID_GET_LAST_TERM_INFO, + NSR_RECON_QUEUE_TO_CONTROL, ctx->current_term); + if (status == -1) + goto out; + + + // send message to other node that just joined to sync up with this node which is also the leader + nsr_driver_log (this->name, GF_LOG_INFO, "sending resolution work to all nodes except this\n"); + bm = ~(1); + send_and_wait(&status, &op_errno, bm, + replica_group_size, + ctx, + NSR_WORK_ID_RESOLUTION_DO_WORK, + NSR_RECON_QUEUE_TO_CONTROL, -1); + if (status == -1) + goto out; + + nsr_driver_log (this->name, GF_LOG_INFO, + "finished recon work as joiner \n"); + break; + + default: + nsr_driver_log (this->name, GF_LOG_ERROR, + "bad state %d", state); + } + + + // free the asasociated recon_info contexts created as part of this role + +out: + nsr_driver_log (this->name, GF_LOG_INFO, + "sending end of reconciliation message \n"); + nsr_recon_return_back(priv, ctx->term, status, op_errno); + nsr_driver_log (this->name, GF_LOG_INFO, + "finished sending end of reconciliation message \n"); + } + list_del_init (&rr->list); + } + + return NULL; +} diff --git a/xlators/cluster/nsr-recon/src/recon_driver.h b/xlators/cluster/nsr-recon/src/recon_driver.h new file mode 100644 index 000000000..3efb26269 --- /dev/null +++ b/xlators/cluster/nsr-recon/src/recon_driver.h @@ -0,0 +1,325 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __RECON_DRIVER_H__ +#define __RECON_DRIVER_H__ + + +#include "api/src/glfs.h" + +#define MAX_HOSTNAME_LEN 32 +#define MAXIMUM_REPLICA_STRENGTH 8 +#define MAX_RECONCILIATION_WINDOW_SIZE 10000 + +#define GLUSTERD_DEFAULT_WORKDIR "/var/lib/glusterd" +#define GLUSTERD_VOLUME_DIR_PREFIX "vols" +#define GLUSTERD_BRICK_INFO_DIR "bricks" + +/* + * Even with the names fixed, the non-NSR_DEBUG definitions of nsr_*_log don't + * work because many callers don't have "this" defined. + * + * TBD: use gf_log, fix "this" problem, eliminate extra fields and newlines. + */ +#define NSR_DEBUG + +typedef enum nsr_recon_work_req_id_t { + NSR_WORK_ID_GET_NONE = 0, + NSR_WORK_ID_GET_LAST_TERM_INFO = NSR_WORK_ID_GET_NONE + 1, + NSR_WORK_ID_GET_GIVEN_TERM_INFO = NSR_WORK_ID_GET_LAST_TERM_INFO + 1, + NSR_WORK_ID_RECONCILIATOR_DO_WORK = NSR_WORK_ID_GET_GIVEN_TERM_INFO + 1, + NSR_WORK_ID_RESOLUTION_DO_WORK = NSR_WORK_ID_RECONCILIATOR_DO_WORK + 1, + NSR_WORK_ID_GET_RECONCILATION_WINDOW = NSR_WORK_ID_RESOLUTION_DO_WORK + 1, + NSR_WORK_ID_SINGLE_RECONCILIATION_READ = NSR_WORK_ID_GET_RECONCILATION_WINDOW + 1, + NSR_WORK_ID_SINGLE_RECONCILIATION_COMMIT = NSR_WORK_ID_SINGLE_RECONCILIATION_READ + 1, + NSR_WORK_ID_SINGLE_RECONCILIATION_FLUSH = NSR_WORK_ID_SINGLE_RECONCILIATION_COMMIT + 1, + NSR_WORK_ID_GET_RESOLUTION_WINDOW = NSR_WORK_ID_SINGLE_RECONCILIATION_FLUSH + 1, + NSR_WORK_ID_END_RECONCILIATION = NSR_WORK_ID_GET_RESOLUTION_WINDOW + 1, + NSR_WORK_ID_INI = NSR_WORK_ID_END_RECONCILIATION + 1, + NSR_WORK_ID_FINI = NSR_WORK_ID_INI + 1 +} nsr_recon_work_req_id_t; + +typedef enum nsr_recon_queue_type_t { + NSR_RECON_QUEUE_TO_CONTROL = 0, + NSR_RECON_QUEUE_TO_DATA =NSR_RECON_QUEUE_TO_CONTROL + 1, +} nsr_recon_queue_type_t; + +typedef enum nsr_log_type_t { + NSR_LOG_HOLE = 0b0, + NSR_LOG_PSEUDO_HOLE = 0b1, + NSR_LOG_FILL = 0b11 +} nsr_log_type_t; + +typedef enum nsr_mode_t { + NSR_SEQ = 0, + NSR_USE_THREADS = 1, + NSR_ASYNC = 2 +} nsr_mode_t; + +typedef enum nsr_recon_work_type_t { + NSR_RECON_WORK_NONE = 0, + NSR_RECON_WORK_HOLE_TO_NOOP = NSR_RECON_WORK_NONE + 1, + NSR_RECON_WORK_HOLE_TO_PSEUDO_HOLE = NSR_RECON_WORK_HOLE_TO_NOOP + 1, + NSR_RECON_WORK_COMPARE_PSEUDO_HOLE = NSR_RECON_WORK_HOLE_TO_PSEUDO_HOLE + 1, + NSR_RECON_WORK_HOLE_TO_FILL = NSR_RECON_WORK_COMPARE_PSEUDO_HOLE + 1, + NSR_RECON_WORK_UNDO_FILL = NSR_RECON_WORK_HOLE_TO_FILL + 1, +} nsr_recon_work_type_t; + +typedef enum nsr_recon_driver_state_t { + none = 0, + leader = 1, + reconciliator = 2, + resolutor = 3, + joiner = 4, +} nsr_recon_driver_state_t; + +// role structure +#pragma pack(push, 1) +typedef struct _nsr_recon_role_s { + uint32_t role; // leader, reconciliator, resolutor + uint32_t num; // required in case state is reconciliator + uint32_t current_term; // current term used in case of leader + // In case this is reconciliator, num is set to nodes that were part + // of previous term. + // In case this is resolutor, num is set to 2. + // info[0] - information for this node. + // info[1] - information of the reconciliator. + // In case this is leader, num is set to this term's membership list + // set info.name to all members including the leader + struct { + int32_t last_term; + int32_t commited_ops; + uint32_t last_index; + uint32_t first_index; + char name[MAX_HOSTNAME_LEN]; + } info[MAXIMUM_REPLICA_STRENGTH]; +} nsr_recon_role_t; +#pragma pack(pop) + +#define ENDIAN_CONVERSION_RR(rr, is_true) \ +{ \ + uint32_t i=0; \ + uint32_t (*f)(uint32_t) = ((is_true == _gf_true) ? ntohl : htonl); \ + if (is_true == _gf_true) rr.num = f(rr.num); \ + rr.current_term = f(rr.current_term); \ + for (i=0; i < rr.num; i++) { \ + rr.info[i].last_term = f(rr.info[i].last_term); \ + rr.info[i].commited_ops = f(rr.info[i].commited_ops); \ + rr.info[i].last_index = f(rr.info[i].last_index); \ + rr.info[i].first_index = f(rr.info[i].first_index); \ + } \ + if (is_true == _gf_false) rr.num = f(rr.num); \ +} + +// last term info structure +#pragma pack(push, 1) +typedef struct _nsr_recon_last_term_info_s { + int32_t last_term; + int32_t commited_ops; + uint32_t last_index; + uint32_t first_index; +} nsr_recon_last_term_info_t; +#pragma pack(pop) + +#define ENDIAN_CONVERSION_LT(lt, is_true) \ +{ \ + uint32_t (*f)(uint32_t) = ((is_true == _gf_true) ? ntohl : htonl); \ + lt.last_term = f(lt.last_term); \ + lt.commited_ops = f(lt.commited_ops); \ + lt.last_index = f(lt.last_index); \ + lt.first_index = f(lt.first_index); \ +} + +// log information +#pragma pack(push, 1) +typedef struct _nsr_recon_log_info_s { + uint32_t term; + uint32_t first_index; + uint32_t last_index; +} nsr_recon_log_info_t; +#pragma pack(pop) + +#define ENDIAN_CONVERSION_LI(li, is_true) \ +{ \ + uint32_t (*f)(uint32_t) = ((is_true == _gf_true) ? ntohl : htonl); \ + li.term = f(li.term); \ + li.first_index = f(li.first_index); \ + li.last_index = f(li.last_index); \ +} + +#pragma pack(push, 1) +typedef struct nsr_recon_record_details_s { + uint32_t type; + uint32_t op; + char gfid[36+1]; + char pargfid[36+1]; + char link_path[256]; // should it be PATH_MAX? + uint32_t offset; + uint32_t len; + char entry[128]; + char newloc[128]; // for rename. can you overload link_path for this? TBD + mode_t mode; +} nsr_recon_record_details_t; +#pragma pack(pop) + +#define ENDIAN_CONVERSION_RD(rd, is_true) \ +{ \ + uint32_t (*f)(uint32_t) = ((is_true == _gf_true) ? ntohl : htonl); \ + rd.type = f(rd.type); \ + rd.op = f(rd.op); \ + rd.offset = f(rd.offset); \ + rd.len = f(rd.len); \ +} + +typedef struct _nsr_role_work_s { + nsr_recon_role_t role; + uint32_t term; + struct list_head list; +} nsr_role_work_t; + +typedef struct _nsr_recon_work_s { + gf_boolean_t in_use; + uint32_t index; + uint32_t req_id; + struct list_head list; +} nsr_recon_work_t; + +typedef struct _nsr_reconciliation_work_s { + uint32_t term; + uint32_t index; + uint32_t type; + uint32_t source; + void *data; + + uint32_t num; // used for xattr + +} nsr_reconciliation_work_t; + +typedef struct _nsr_reconciliation_record_s { + nsr_reconciliation_work_t work; // will store the computed work + nsr_recon_record_details_t rec; +} nsr_reconciliation_record_t; + +typedef struct _nsr_reconciliator_info { + uint32_t reconcilator_index; + int32_t last_term; + int32_t commited_ops; + uint32_t last_index; + uint32_t first_index; + //nsr_reconciliation_record_t records[MAX_RECONCILIATION_WINDOW_SIZE]; + nsr_reconciliation_record_t *records; +} nsr_reconciliator_info_t; + +typedef struct _nsr_per_node_worker_s { + char *id; // identifier + char vol_file[256]; //volfile that will be used by this thread + glfs_t *fs; + glfs_fd_t *aux_fd; + uint32_t index; // index into array of workers + pthread_t thread_id; // thread id + void * context; // thread context + struct _nsr_recon_driver_ctxt *driver_ctx; + char local; // local data worker + //struct list_head list; //list of work items + nsr_recon_work_t head; + pthread_mutex_t mutex; //mutex to guard the state + pthread_cond_t cv; //condition variable for signaling the worker thread + gf_boolean_t is_control; +#if defined(NSR_DEBUG) + FILE *fp; +#endif + int32_t result; // result of latest work + int32_t op_errno; // errno +} nsr_per_node_worker_t; + +typedef struct _nsr_replica_worker_s { + char name[256]; + nsr_per_node_worker_t *control_worker; + nsr_per_node_worker_t *data_worker; + gf_boolean_t in_use; + nsr_reconciliator_info_t *recon_info; // Bunch of infos kept for this reconciliation +} nsr_replica_worker_t; + +typedef struct _nsr_recon_driver_ctxt { + xlator_t *this; + uint32_t replica_group_size; // number of static members of replica group + nsr_replica_worker_t *workers; // worker info + int32_t reconciliator; + pthread_mutex_t mutex; + pthread_cond_t cv; + nsr_role_work_t role_head; + volatile int32_t outstanding; + uint32_t reconciliator_index; + uint32_t term; + uint32_t current_term; + nsr_mode_t mode; // default set to seq +#if defined(NSR_DEBUG) + FILE *fp; +#endif +} nsr_recon_driver_ctx_t; + +void * +nsr_reconciliation_driver(void *); + +gf_boolean_t +nsr_recon_driver_set_role(nsr_recon_driver_ctx_t *ctx, nsr_recon_role_t *rr, uint32_t term); + +#define atomic_inc(ptr) ((void) __sync_fetch_and_add(ptr, 1)) +#define atomic_dec(ptr) ((void) __sync_fetch_and_add(ptr, -1)) +#define atomic_fetch_and __sync_fetch_and_and +#define atomic_fetch_or __sync_fetch_and_or + +#if defined(NSR_DEBUG) + +#define NSR_LOG_DIR "/var/log/nsr-logs" + +extern int nsr_debug_level; +extern FILE *recon_create_log (char *member, char *module); + +extern void +_nsr_driver_log (const char *func, int line, char *member, FILE *fp, + char *fmt, ...); + +#define nsr_driver_log(dom, levl, fmt...) do { \ + FMT_WARN (fmt); \ + if (levl <= nsr_debug_level) { \ + nsr_recon_private_t *priv = ctx->this->private; \ + _nsr_driver_log (__FUNCTION__, __LINE__, \ + priv->replica_group_members[0], \ + ctx->fp, \ + ##fmt); \ + } \ +} while (0) + +extern void +_nsr_worker_log (const char *func, int line, char *member, + char *type, uint32_t index, FILE *fp, + char *fmt, ...); + +#define nsr_worker_log(dom, levl, fmt...) do { \ + FMT_WARN (fmt); \ + if (levl <= nsr_debug_level) { \ + nsr_recon_private_t *priv; \ + priv = ctx->driver_ctx->this->private; \ + _nsr_worker_log (__FUNCTION__, __LINE__, \ + priv->replica_group_members[0], \ + ctx->is_control ? "recon-con" : \ + "recon-data", \ + ctx->index, ctx->fp, \ + ##fmt); \ + } \ +} while (0) + +#else +#define nsr_driver_log(dom, levl, fmt...) gf_log(dom, levl, fmt) +#define nsr_worker_log(dom, levl, fmt...) gf_log(dom, levl, fmt) +#endif + +#endif /* #ifndef __RECON_DRIVER_H__ */ diff --git a/xlators/cluster/nsr-recon/src/recon_xlator.c b/xlators/cluster/nsr-recon/src/recon_xlator.c new file mode 100644 index 000000000..272c35dc2 --- /dev/null +++ b/xlators/cluster/nsr-recon/src/recon_xlator.c @@ -0,0 +1,1010 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <sys/types.h> +#include <fcntl.h> +#include <string.h> +#include <unistd.h> + +#include "call-stub.h" +#include "defaults.h" +#include "xlator.h" + +#include "recon_driver.h" +#include "recon_xlator.h" + +typedef struct _nsr_recon_fd_s { + int32_t term; + nsr_recon_driver_state_t state; + uint32_t first_index; + uint32_t last_index; + call_frame_t *frame; +} nsr_recon_fd_t; + +#if defined(NSR_DEBUG) + +void +_recon_main_log (const char *func, int line, char *member, FILE *fp, + char *fmt, ...) +{ + va_list ap; + char *buf = NULL; + int retval; + + if (!fp) { + fp = recon_create_log(member,"recon-main-log"); + if (!fp) { + return; + } + } + + va_start(ap,fmt); + retval = vasprintf(&buf,fmt,ap); + if (buf) { + fprintf(fp,"[%s:%d] %.*s\n",func,line,retval,buf); + free(buf); + } + va_end(ap); +} + +#endif + +// Given fd, get back the NSR based fd context. +static int32_t this_fd_ctx_get(fd_t *fd, xlator_t *this, nsr_recon_fd_t **rfd) +{ + uint64_t tmp = 0; + int32_t ret = -1; + + if ((ret = fd_ctx_get(fd, this, &tmp)) != 0) { + return ret; + } else { + *rfd = (nsr_recon_fd_t *)tmp; + return 0; + } +} + +// Add the frame in q after associating with term +// term usage tbd +static void put_frame(nsr_recon_private_t *priv, + call_frame_t *frame, + uint32_t term) +{ + xlator_t *this = priv->this; + recon_main_log (this->name, GF_LOG_INFO, "adding frame for term %d \n", term); + priv->frame = frame; + return; +} + +// get the frame from the queue given the term +// term usage tbd +static void get_frame(nsr_recon_private_t *priv, + call_frame_t **frame, + uint32_t term) +{ + if (frame != NULL) + *frame = priv->frame; + priv->frame = NULL; + return; +} + +// check if there are outstanding frames +static gf_boolean_t is_frame(nsr_recon_private_t *priv) +{ + return((priv->frame != NULL) ? _gf_true : _gf_false); +} + +#define ENTRY_SIZE 128 + +long +get_entry_count (char *path) +{ + int fd; + struct stat buf; + unsigned long entries = -1; + long min; /* last entry not known to be empty */ + long max; /* first entry known to be empty */ + long curr; + char entry[ENTRY_SIZE]; + void *err_label = &&done; + + fd = open(path,O_RDONLY); + if (fd < 0) { + goto *err_label; + } + err_label = &&close_fd; + + if (fstat(fd,&buf) < 0) { + goto *err_label; + } + + min = 0; + max = buf.st_size / ENTRY_SIZE; + printf("max = %ld\n",max); + + while ((min+1) < max) { + curr = (min + max) / 2; + printf("trying entry %ld\n",curr); + if (lseek(fd,curr*ENTRY_SIZE,SEEK_SET) < 0) { + goto *err_label; + } + if (read(fd,entry,sizeof(entry)) != sizeof(entry)) { + goto *err_label; + } + if ((entry[0] == '_') && (entry[1] == 'P')) { + min = curr; + } + else { + max = curr; + } + } + + entries = max; + +close_fd: + close(fd); +done: + return entries; +} + +// Get the term info for the term number specified +void nsr_recon_libchangelog_get_this_term_info(xlator_t *this, char *bp, int32_t term, nsr_recon_last_term_info_t *lt) +{ + char path[PATH_MAX]; + long entries; + + bzero(lt, sizeof(nsr_recon_last_term_info_t)); + lt->last_term = term; + sprintf(path,"%s/%s%d",bp,"TERM.",term); + entries = get_entry_count(path); + if (entries > 1) { + /* The first entry is actually a header. */ + lt->first_index = 1; + /* + * This seems wrong, because it means that last_index*128 will + * be exactly at EOF and commited_ops will be one greater than + * it should be. Maybe some other code makes the exact + * opposite mistake to compensate. + */ + lt->last_index = lt->commited_ops = (int)entries; + } + recon_main_log (this->name, GF_LOG_INFO, "for term=%d got first_index=%d last_index=%d commited_ops=%d\n", + term, lt->first_index, lt->last_index, lt->commited_ops); + return; +} + +// Given the term number, find the last term in the changelogs +void nsr_recon_libchangelog_get_last_term_info(xlator_t *this, char *bp, int32_t term, nsr_recon_last_term_info_t *lt) +{ + uint32_t t = term; + struct stat buf; + char path[PATH_MAX]; + bzero(lt, sizeof(nsr_recon_last_term_info_t)); + while(t) { + // journal file is of type TERM-1.jnl + sprintf(path,"%s/%s%d",bp,"TERM.",t); + if (!stat(path, &buf)) { + nsr_recon_libchangelog_get_this_term_info(this, bp, t, lt); + recon_main_log (this->name, GF_LOG_INFO, "got last term given current term %d as %d\n", term, t); + return; + } + t--; + } + recon_main_log (this->name, GF_LOG_INFO, "got no last term given current term %d \n", term); + + return; +} + +// Return back the frame stored against the term +void nsr_recon_return_back(nsr_recon_private_t *priv, uint32_t term, int32_t status, int32_t op_errno) +{ + call_frame_t *old_frame = NULL; + xlator_t *this = priv->this; + + get_frame(priv, &old_frame, term); + if (old_frame) { + recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_writev returns old frame \n"); + // first return the original write for which this ack was sent + STACK_UNWIND_STRICT (writev, old_frame, status, op_errno, NULL, NULL, NULL); + } else { + recon_main_log (this->name, GF_LOG_ERROR, "EIII---nsr_recon_writev cnnot return old frame \n"); + } +} + +typedef enum records_type_t { + fop_gfid_pgfid_oldloc_newloc = 1, + fop_gfid_pgfid_entry = fop_gfid_pgfid_oldloc_newloc + 1, + fop_gfid = fop_gfid_pgfid_entry + 1 , + fop_gfid_offset = fop_gfid + 1, + fop_gfid_offset_len = fop_gfid_offset + 1, +} records_type_t; + +// Get the backend ./glusterfs/xx/xx/<...> path +static void +get_gfid_path(nsr_recon_private_t *priv, char *gfid, char *path) +{ + strcpy(path, priv->base_dir); + strcat(path, "/.glusterfs/"); + strncat(path,gfid,2); + strcat(path,"/"); + strncat(path,gfid+2,2); + strcat(path,"/"); + strcat(path,gfid); +} + + +// Get the link to which backend points to +static gf_boolean_t +get_link_using_gfid(nsr_recon_private_t *priv, char *gfid, char *path) +{ + char lp[PATH_MAX]; + xlator_t *this = priv->this; + get_gfid_path(priv,gfid, lp); + if (readlink(lp, path, 255) == -1) { + GF_ASSERT(0); + recon_main_log(priv->this, GF_LOG_ERROR, + "cannot get readlink for %s\n",lp); + return _gf_false; + } + return _gf_true; +} + +// Get the list of changelog records given a term , first and last index. +// +// TBD: rewrite this hideous ball of mud in at least the following ways: +// +// (1) Break out the code for handling a single record into a separate +// function, to make error handling easier and reduce "indentation +// creep" so the code's readable. +// +// (2) Change all of the fop_xxx_yyy nonsense to OR together values +// like FOP_HAS_FIELD_XXX and FOP_HAS_FIELD_YYY, to reduce code +// duplication and facilitate the addition of new fields. +// +// (3) Stop making so many assumptions about the underlying formats. +// The code as it is won't even work for the existing binary format, +// let alone as changelog evolves over time. +// +// Really, 90% of this code should just GO AWAY in favor of using +// libgfchangelog, enhanced as necessary to support our needs. + +/* + * Use this macro to skip over a field we're not using yet. + * NB: the body is a null statement on purpose + * TBD: all instances of this should be removed eventually! + */ +#define SKIP_FIELD do /* nothing */ ; while (*(start++) != '\0') + +#define SKIP_OVER +gf_boolean_t nsr_recon_libchangelog_get_records(xlator_t *this, char *bp, int32_t term, uint32_t first, uint32_t last, void *buf) +{ + // do a mmap; seek into the first and read all records till last. + // TBD - right now all records are pseudo holes but mark them as fills. + // TBD - pseudo hole to be implemented when actual fsync gets done on data. + char *rb = NULL, *orig = NULL; + char path[PATH_MAX]; + int fd; + uint32_t index = 0; + + recon_main_log (this->name, GF_LOG_INFO, + "libchangelog_get_records called for term %d index from %d to %d \n", + term, first, last ); + + orig = rb = GF_CALLOC(128, ((last - first) + 1), + gf_mt_recon_changelog_buf_t); + + sprintf(path,"%s/%s%d",bp,"TERM.",term); + fd = open(path, O_RDONLY); + if (fd == -1) { + return _gf_false; + } else { + char *start = NULL; + nsr_recon_record_details_t * rec = (nsr_recon_record_details_t *)buf; + + if (first == 0) + lseek(fd, 128, SEEK_SET); + else + lseek(fd, first * 128, SEEK_SET); + if (read(fd, rb, (last - first + 1) * 128) == -1) { + return _gf_false; + } + start = rb; + index = first; + do { + recon_main_log (this->name, GF_LOG_INFO, + "libchangelog_get_records start inspecting records at index %d \n", + index ); + if (!strncmp(start, "_PRE_", 5)) { + uint32_t i; + uint32_t opcode = 0; + records_type_t type; + + start += 5; + // increment by the NULLs after the PRE + start += 4; + SKIP_FIELD; // real index + // now we have the opcode + while (*start != '\0') { + opcode *= 10; + opcode += (*(start++) - '0'); + } + ++start; + recon_main_log (this->name, GF_LOG_ERROR, + "libchangelog_get_records: got opcode %d @index %d\n", opcode, index); + if ((opcode == GF_FOP_RENAME)) { + type = fop_gfid_pgfid_oldloc_newloc; + } else if ((opcode == GF_FOP_UNLINK) || + (opcode == GF_FOP_RMDIR) || + (opcode == GF_FOP_LINK) || + (opcode == GF_FOP_MKDIR) || + (opcode == GF_FOP_SYMLINK) || + (opcode == GF_FOP_MKNOD) || + (opcode == GF_FOP_CREATE)) { + type = fop_gfid_pgfid_entry; + } else if ((opcode == GF_FOP_FSETATTR) || + (opcode == GF_FOP_SETATTR) || + (opcode == GF_FOP_FREMOVEXATTR) || + (opcode == GF_FOP_REMOVEXATTR) || + (opcode == GF_FOP_SETXATTR) || + (opcode == GF_FOP_FSETXATTR)) { + type = fop_gfid; + } else if ((opcode == GF_FOP_TRUNCATE) || + (opcode == GF_FOP_FTRUNCATE)) { + type = fop_gfid_offset; + } else if (opcode == GF_FOP_WRITE) { + type = fop_gfid_offset_len; + } else { + recon_main_log (this->name, + GF_LOG_ERROR, + "libchangelog_get_records:got no proper opcode %d @index %d\n", + opcode, index); + //GF_ASSERT(0); + // make this as a hole. + // TBD - check this logic later. maybe we should raise alarm here because + // this means that changelog is corrupted. We are not handling changelog + // corruptions as of now. + rec->type = NSR_LOG_HOLE; + goto finish; + } + // TBD - handle psuedo holes once that logic is in. + rec->type = NSR_LOG_FILL; + recon_main_log (this->name, GF_LOG_ERROR, + "libchangelog_get_records:got type %d at index %d \n", + rec->type, index); + rec->op = opcode; + + // Now get the gfid and parse it + // before that increment the pointer + for (i=0; i < 36; i++) { + rec->gfid[i] = (*start); + start++; + } + rec->gfid[i] = '\0'; + + GF_ASSERT(*start == 0); + start ++; + + if (opcode == GF_FOP_SYMLINK) { + i = 0; + do { + if (i >= 256) { + goto finish; + } + rec->link_path[i++] = *start; + } while (*(start++) != '\0'); + } + + i = 0; + // If type is fop_gfid_offset+_len, get offset + if ((type == fop_gfid_offset) || (type == fop_gfid_offset_len)) { + char offset_str[128]; + while(*start != 0) { + offset_str[i++] = *start; + start ++; + } + offset_str[i] = '\0'; + // get over the 0 + start++; + rec->offset = strtoul(offset_str, NULL, 10); + recon_main_log (this->name, + GF_LOG_ERROR, + "libchangelog_get_records:got offset %d @index %d \n", rec->offset, index); + + } + i = 0; + if (type == fop_gfid_offset_len) { + char len_str[128]; + while(*start != 0) { + len_str[i++] = *start; + start ++; + } + len_str[i] = '\0'; + // get over the 0 + start++; + rec->len = strtoul(len_str, NULL, 10); + recon_main_log (this->name, + GF_LOG_ERROR, + "libchangelog_get_records:got length %d @index %d \n", rec->len, index); + } + i = 0; + if (type == fop_gfid_pgfid_entry) { + switch (opcode) { + case GF_FOP_CREATE: + case GF_FOP_MKDIR: + case GF_FOP_MKNOD: + SKIP_FIELD; // mode + break; + /* TBD: handle GF_FOP_SYMLINK target */ + default: + ; + } + SKIP_FIELD; // uid + SKIP_FIELD; // gid + if (opcode == GF_FOP_MKNOD) { + SKIP_FIELD; // dev + } + // first get the gfid and then the path + for (i=0; i < 36; i++) { + rec->pargfid[i] = (*start); + start++; + } + rec->pargfid[i] = '\0'; + GF_ASSERT(*start == '/'); + start ++; + + i = 0; + while(*start != 0) { + rec->entry[i++] = *start; + start ++; + } + rec->entry[i] = '\0'; + // get over the 0 + start++; + /* + * Having to add this as a special case + * is awful. See the function header + * comment for the real solution. + */ + if (opcode == GF_FOP_CREATE) { + rec->mode = 0; + while (*start != '\0') { + rec->mode *= 10; + rec->mode += *start + - '0'; + ++start; + } + ++start; + } + recon_main_log (this->name, + GF_LOG_ERROR, + "libchangelog_get_records:got entry %s @index %d \n", rec->entry, index); + + } + i = 0; + if (type == fop_gfid_pgfid_oldloc_newloc) { + + // first get the source and then the destination + // source stuff gets stored in pargfid/entry + for (i=0; i < 36; i++) { + rec->pargfid[i] = (*start); + start++; + } + rec->pargfid[i] = '\0'; + GF_ASSERT(*start == '/'); + start ++; + + i=0; + while(*start != 0) { + rec->entry[i++] = *start; + start ++; + } + rec->entry[i] = '\0'; + // get over the 0 + start++; + + // dst stuff gets stored in gfid/newloc + for (i=0; i < 36; i++) { + rec->gfid[i] = (*start); + start++; + } + rec->gfid[i] = '\0'; + GF_ASSERT(*start == '/'); + start ++; + i = 0; + while(*start != 0) { + rec->newloc[i++] = *start; + start ++; + } + rec->newloc[i] = '\0'; + // get over the 0 + start++; + + } + ENDIAN_CONVERSION_RD((*rec), _gf_false); //htonl + } +finish: + if (index == last) + break; + index++; + rb += 128; + start = rb; + rec++; + } while(1); + } + GF_FREE(orig); + close(fd); + + recon_main_log (this->name, GF_LOG_INFO, + "libchangelog_get_records finsihed inspecting records for term %d \n", + term); + return _gf_true; +} + +int32_t +nsr_recon_open (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata) +{ + int32_t op_ret = 0; + int32_t op_errno = 0; + nsr_recon_fd_t *rfd = NULL; + + recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_open called for path %s \n",loc->path ); + rfd = GF_CALLOC (1, sizeof (*rfd), gf_mt_recon_fd_t); + if (!rfd) { + op_ret = -1; + op_errno = ENOMEM; + } + + op_ret = fd_ctx_set (fd, this, (uint64_t)(long)rfd); + if (op_ret) { + op_ret = -1; + op_errno = EINVAL; + } + recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_open returns with %d for path %s \n",op_ret,loc->path ); + STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, NULL); + return 0; +} + +int32_t +nsr_recon_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata) +{ + nsr_recon_fd_t *rfd = NULL; + nsr_recon_private_t *priv = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + int32_t ret = 0; + + ret = this_fd_ctx_get (fd, this, &rfd); + if (ret < 0) { + return -1; + } + priv = (nsr_recon_private_t *)this->private; + + recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_writev called for offset %d \n",(unsigned int)offset ); + GF_ASSERT(count == 1); + switch (offset) { + // client(brick, leader) writes the role of the node + case nsr_recon_xlator_sector_1 : + { + nsr_recon_role_t rr; + memcpy((void *)&rr, (void *)vector[0].iov_base, sizeof(rr)); + ENDIAN_CONVERSION_RR(rr, _gf_true); //ntohl + + recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_writev called to set role %d\n", rr.role); + if ((rr.role != leader) && + (rr.role != reconciliator) && + (rr.role != resolutor) && + (rr.role != joiner)) { + recon_main_log (this->name, GF_LOG_ERROR, + "EIII---nsr_recon_writev cannot set state \n"); + STACK_UNWIND_STRICT (writev, frame, -1, op_errno, + NULL, NULL, NULL); + } + + GF_ASSERT(rr.num <= MAXIMUM_REPLICA_STRENGTH); + + // Check if already a role play is going on. If yes return with EAGAIN. + // Ideally we should check if we have got a higher term number while + // servicing a lower term number; if so abort the older one. + // However the abort infrastructure needs to be sketched properly; TBD. + if (is_frame(priv) == _gf_true) { + recon_main_log (this->name, GF_LOG_ERROR, + "nsr_recon_writev set_role - already role play \n"); + STACK_UNWIND_STRICT (writev, frame, -1, EAGAIN, + NULL, NULL, NULL); + } else { + + // Store the stack frame so that when the actual job gets finished + // we send the response back to the brick. + put_frame(priv, frame, rr.current_term); + if (nsr_recon_driver_set_role(priv->driver_thread_context, + &rr, + rr.current_term) == _gf_false) { + get_frame(priv, NULL, rr.current_term); + recon_main_log (this->name, GF_LOG_ERROR, + "nsr_recon_writev set_role - cannot seem to set role \n"); + STACK_UNWIND_STRICT (writev, frame, -1, op_errno, + NULL, NULL, NULL); + } else { + recon_main_log (this->name, GF_LOG_INFO, + "nsr_recon_writev set_role - set role succesfully \n"); + } + } + break; + } + // client(reconciliator) writes how much it needs for the read + case nsr_recon_xlator_sector_2 : + { + nsr_recon_log_info_t li; + memcpy((void *)&li, (void *)vector[0].iov_base, sizeof(li)); + ENDIAN_CONVERSION_LI(li, _gf_true); //ntohl + + recon_main_log (this->name, GF_LOG_INFO, + "nsr_recon_writev - setting term info for reconcilation info. term=%d, first_index=%d,start_index=%d \n", + li.term, li.first_index, li.last_index); + rfd->term = li.term; + rfd->last_index = li.last_index; + rfd->first_index = li.first_index; + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, + NULL, NULL, NULL); + break; + } + // client(reconciliator) writes term for which it needs info + case nsr_recon_xlator_sector_3 : + { + int32_t term; + + memcpy((void *)&term, (void *)vector[0].iov_base, sizeof(term)); + term = ntohl(term); //ntohl + recon_main_log (this->name, GF_LOG_INFO, + "nsr_recon_writev - setting term info for term info. term=%d\n", + term); + rfd->term = term; + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, + NULL, NULL, NULL); + break; + } + // client(reconciliator) writes current term so that it gets last term info later + case nsr_recon_xlator_sector_4 : + { + int32_t term; + + memcpy((void *)&term, (void *)vector[0].iov_base, sizeof(term)); + term = ntohl(term); //ntohl + recon_main_log (this->name, GF_LOG_INFO, + "nsr_recon_writev - setting term info for last term info given current term=%d\n", + term); + rfd->term = term; + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, + NULL, NULL, NULL); + break; + } + default: + { + recon_main_log (this->name, GF_LOG_ERROR, + "nsr_recon_writev called with wrong offset\n"); + STACK_UNWIND_STRICT (writev, frame, -1, op_errno, + NULL, NULL, NULL); + break; + } + } + + return 0; +} + +int +nsr_recon_readv (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata) +{ + nsr_recon_fd_t *rfd = NULL; + int32_t op_errno = 0; + // copied stuff from quick-read.c and posix.c + struct iobuf *iobuf = NULL; + struct iobref *iobref = NULL; + struct iovec iov = {0, }; + int32_t ret = -1; + nsr_recon_private_t *priv = NULL; + + iobuf = iobuf_get2 (this->ctx->iobuf_pool, size); + if (!iobuf) { + op_errno = ENOMEM; + goto out; + } + + iobref = iobref_new (); + if (!iobref) { + op_errno = ENOMEM; + goto out; + } + + iobref_add (iobref, iobuf); + + ret = this_fd_ctx_get (fd, this, &rfd); + if (ret < 0) { + op_errno = -ret; + goto out; + } + priv = (nsr_recon_private_t *)this->private; + + recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_readv called for offset %d \n",(unsigned int)offset ); + switch (offset) { + // client(leader) reads from here to get info for this term on this node + // invole libchagelog to get the information + case nsr_recon_xlator_sector_3 : + { + nsr_recon_last_term_info_t lt; + GF_ASSERT(size == sizeof(lt)); + nsr_recon_libchangelog_get_this_term_info(this,priv->changelog_base_path, rfd->term, <); + recon_main_log (this->name, GF_LOG_INFO, + "nsr_recon_readv - getting term info for term=%d, ops=%d, first=%d, last=%d\n", + rfd->term, lt.commited_ops, lt.first_index, lt.last_index); + ENDIAN_CONVERSION_LT(lt, _gf_false); //htonl + memcpy(iobuf->ptr, <, size); + goto out; + } + // client(reconciliator) reads individual record information + case nsr_recon_xlator_sector_2 : + { + uint32_t num = (rfd->last_index - rfd->first_index + 1); + recon_main_log (this->name, GF_LOG_INFO, + "nsr_recon_readv - expected size %lu got size %lu\n", + (num * sizeof(nsr_recon_record_details_t)), size); + + GF_ASSERT(size == (num * sizeof(nsr_recon_record_details_t))); + bzero(iobuf->ptr, size); + recon_main_log (this->name, GF_LOG_INFO, + "nsr_recon_readv - getting records for term=%d from %d to %d\n", + rfd->term, rfd->first_index, rfd->last_index); + nsr_recon_libchangelog_get_records(this, priv->changelog_base_path, + rfd->term, rfd->first_index, rfd->last_index, iobuf->ptr); + goto out; + } + // read last term info + case nsr_recon_xlator_sector_4 : + { + nsr_recon_last_term_info_t lt; + GF_ASSERT(size == sizeof(lt)); + nsr_recon_libchangelog_get_last_term_info(this, priv->changelog_base_path, rfd->term, <); + recon_main_log (this->name, GF_LOG_INFO, + "nsr_recon_readv - getting last term info given current term=%d. last term = %d ops=%d, first=%d, last=%d\n", + rfd->term, lt.last_term, lt.commited_ops, lt.first_index, lt.last_index); + ENDIAN_CONVERSION_LT(lt, _gf_false); //htonl + memcpy(iobuf->ptr, <, size); + goto out; + } + default: + { + recon_main_log (this->name, GF_LOG_ERROR, + "nsr_recon_readv called with wrong offset\n"); + op_errno = -1; + break; + } + } + +out: + if (op_errno == 0) { + iov.iov_base = iobuf->ptr; + ret = iov.iov_len = size; + } + + STACK_UNWIND_STRICT (readv, frame, ret, op_errno, &iov, 1, NULL, iobref , NULL); + + if (iobref) + iobref_unref (iobref); + if (iobuf) + iobuf_unref (iobuf); + return 0; +} + +int +nsr_recon_lookup (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *xdata) +{ + struct iatt buf = {0, }; + // dirty hack to set root as regular but seems to work. + buf.ia_type = IA_IFREG; + recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_lookup called \n"); + + STACK_UNWIND_STRICT (lookup, frame, 0, 0, this->itable->root, &buf, NULL, NULL); + return 0; +} + + +int32_t +nsr_recon_flush (call_frame_t *frame, xlator_t *this, + fd_t *fd, dict_t *xdata) +{ + STACK_UNWIND_STRICT (flush, frame, 0, 0, NULL); + return 0; +} + + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + GF_VALIDATE_OR_GOTO ("recon", this, out); + + ret = xlator_mem_acct_init (this, gf_mt_recon_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "Memory accounting init" "failed"); + return ret; + } +out: + return ret; +} + + +int32_t +init (xlator_t *this) +{ + nsr_recon_private_t *priv = NULL; + char *local, *members; + unsigned int i=0; + + priv = GF_CALLOC (1, sizeof (*priv), gf_mt_recon_private_t); + if (!priv) { + gf_log (this->name, GF_LOG_ERROR, + "priv allocation error\n"); + return -1; + } + GF_OPTION_INIT ("replica-group-size", priv->replica_group_size, uint32, err); + GF_OPTION_INIT ("vol-name", priv->volname, str, err); + if (!priv->volname) { + gf_log (this->name, GF_LOG_ERROR, + "missing volname option (required)"); + return -1; + } + GF_OPTION_INIT ("changelog-dir", priv->changelog_base_path, str, err); + if (!priv->changelog_base_path) { + gf_log (this->name, GF_LOG_ERROR, + "missing changelog directory option (required)"); + return -1; + } + GF_OPTION_INIT ("base-dir", priv->base_dir, str, err); + if (!priv->base_dir) { + gf_log (this->name, GF_LOG_ERROR, + "missing brick base directory option (required)"); + return -1; + } + GF_OPTION_INIT ("replica-group-members", members, str, err); + if (!members) { + gf_log (this->name, GF_LOG_ERROR, + "missing membership option (required)"); + return -1; + } + GF_OPTION_INIT ("local-member", local, str, err); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, + "missing local member option (required)"); + return -1; + } + + priv->replica_group_members = GF_CALLOC (priv->replica_group_size, + sizeof(char *), + gf_mt_recon_members_list_t); + priv->replica_group_members[0] = GF_CALLOC (1, + strlen(local), + gf_mt_recon_member_name_t); + if (!priv->replica_group_members || !(priv->replica_group_members[0])) { + gf_log (this->name, GF_LOG_ERROR, + "str allocation error\n"); + return -1; + } + strcpy(priv->replica_group_members[0], local); + for (i=1; i < priv->replica_group_size; i++) { + char *member; + if (i == 1) + member = strtok(members, ","); + else + member = strtok(NULL, ","); + priv->replica_group_members[i] = GF_CALLOC (1, + strlen(member) + 1, gf_mt_recon_member_name_t); + if (!priv->replica_group_members[i]) { + gf_log (this->name, GF_LOG_ERROR, + "str allocation error\n"); + return -1; + } + strcpy(priv->replica_group_members[i], member); + } + + + priv->this = this; + this->private = (void *)priv; + + priv->fp = recon_create_log (priv->replica_group_members[0], "recon-main-log"); + if (!priv->fp) + return -1; + + recon_main_log (this->name, GF_LOG_INFO, "creating reconciliation driver \n"); + + if (pthread_create(&priv->thread_id, NULL, nsr_reconciliation_driver, priv)) { + recon_main_log (this->name, GF_LOG_ERROR, + "pthread creation error \n"); + return -1; + } + + INIT_LIST_HEAD(&(priv->list)); + + + return 0; + +err: + return -1; +} + + +void +fini (xlator_t *this) +{ + nsr_recon_private_t *priv = NULL; + void *ret = NULL; + + priv = (nsr_recon_private_t *)this->private; + + pthread_cancel(priv->thread_id); + pthread_join(priv->thread_id, &ret); +} + + +struct xlator_fops fops = { + .open = nsr_recon_open, + .readv = nsr_recon_readv, + .writev = nsr_recon_writev, + .lookup = nsr_recon_lookup, + .flush = nsr_recon_flush +}; + +struct xlator_cbks cbks = { +}; + +struct volume_options options[] = { + { .key = {"replica-group-size"}, + .type = GF_OPTION_TYPE_INT, + .min = 2, + .max = INT_MAX, + .default_value = "2", + .description = "Number of bricks in replica group. can be derived but putting it here for testing." + }, + { + .key = {"vol-name"}, + .type = GF_OPTION_TYPE_STR, + .description = "volume name" + }, + { + .key = {"local-member"}, + .type = GF_OPTION_TYPE_STR, + .description = "member(brick) for which this translator is responsible." + }, + { + .key = {"replica-group-members"}, + .type = GF_OPTION_TYPE_STR, + .description = "Comma seperated member names other than local." + }, + { + .key = {"changelog-dir"}, + .type = GF_OPTION_TYPE_STR, + .description = "Base directory where per term changelogs are maintained." + }, + { + .key = {"base-dir"}, + .type = GF_OPTION_TYPE_STR, + .description = "Base directory for this brick. This should go away once we fix gfid based lookups" + }, + { .key = {NULL} }, +}; diff --git a/xlators/cluster/nsr-recon/src/recon_xlator.h b/xlators/cluster/nsr-recon/src/recon_xlator.h new file mode 100644 index 000000000..d9692a632 --- /dev/null +++ b/xlators/cluster/nsr-recon/src/recon_xlator.h @@ -0,0 +1,92 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __RECON_XLATOR_H__ +#define __RECON_XLATOR_H__ + +#include <semaphore.h> +#include <pthread.h> + +enum gf_dht_mem_types_ { + gf_mt_recon_changelog_buf_t = gf_common_mt_end + 1, + gf_mt_recon_driver_ctx_t, + gf_mt_recon_fd_t, + gf_mt_recon_id_t, + gf_mt_recon_member_name_t, + gf_mt_recon_members_list_t, + gf_mt_recon_per_node_worker_t, + gf_mt_recon_private_t, + gf_mt_recon_reconciliator_info_t, + gf_mt_recon_record_t, + gf_mt_recon_record_details_t, + gf_mt_recon_role_work_t, + gf_mt_recon_work_t, + gf_mt_recon_work_data_t, + gf_mt_recon_worker_t, + gf_mt_recon_end, +}; + +enum nsr_recon_xlator_sector_t { + nsr_recon_xlator_sector_0 = 0, // to report back the status of given transaction ids + nsr_recon_xlator_sector_1 = 512, // to write here information about leadership changes from the brick + nsr_recon_xlator_sector_2 = (512 * 2), // to write here individual roles and wait for that role to be done + nsr_recon_xlator_sector_3 = (512 *3), // read from here to get term info for given term + nsr_recon_xlator_sector_4 = (512 * 4), // read from here to get last term info +}; + + +typedef struct _nsr_recon_private_s { + xlator_t *this; //back pointer + unsigned int replica_group_size; // number of static members of replica group + char **replica_group_members; // replica group members (including itself in first slot) + pthread_t thread_id; // driver thread id + nsr_recon_driver_ctx_t *driver_thread_context; //driver thread context + unsigned int outstanding; // for communicating with driver thread + call_frame_t *frame; // old frame that is pending (just one as of now) + struct list_head list; + char *volname; + uint32_t txn_id; + char *changelog_base_path; + char *base_dir; +#if defined(NSR_DEBUG) + FILE *fp; +#endif +} nsr_recon_private_t; + +#define atomic_cmpxchg __sync_val_compare_and_swap + +#if defined(NSR_DEBUG) + +extern void +_recon_main_log (const char *func, int line, char *member, FILE *fp, + char *fmt, ...); + +#define recon_main_log(dom, levl, fmt...) do { \ + FMT_WARN (fmt); \ + if (levl <= nsr_debug_level) { \ + nsr_recon_private_t *priv = this->private; \ + _recon_main_log (__FUNCTION__, __LINE__, \ + priv->replica_group_members[0], \ + priv->fp, \ + ##fmt); \ + } \ +} while (0) + +#else +#define recon_main_log(dom, levl, fmt...) gf_log(dom, levl, fmt) +#endif + +void nsr_recon_libchangelog_get_this_term_info(xlator_t *this, char *bp, int32_t term, nsr_recon_last_term_info_t *lt); +void nsr_recon_libchangelog_get_last_term_info(xlator_t *this, char *bp, int32_t term, nsr_recon_last_term_info_t *lt); +void nsr_recon_return_back(nsr_recon_private_t *priv, uint32_t term, int32_t status, int32_t op_errno); +gf_boolean_t nsr_recon_libchangelog_get_records(xlator_t *this, char *bp, int32_t term, uint32_t first, uint32_t last, void *buf); + + +#endif /* #ifndef __RECON_XLATOR_H__ */ diff --git a/xlators/cluster/nsr-server/Makefile.am b/xlators/cluster/nsr-server/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/cluster/nsr-server/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/cluster/nsr-server/src/Makefile.am b/xlators/cluster/nsr-server/src/Makefile.am new file mode 100644 index 000000000..0092aad4f --- /dev/null +++ b/xlators/cluster/nsr-server/src/Makefile.am @@ -0,0 +1,43 @@ +noinst_PYTHON = codegen.py gen-fops.py + +xlator_LTLIBRARIES = nsr.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + +nsr_la_LDFLAGS = -module -avoid-version -lcurl + +if ENABLE_ETCD_SIM +nsr_la_SOURCES = nsr.c leader.c recon_notify.c etcd-sim.c +else +nsr_la_SOURCES = nsr.c leader.c recon_notify.c etcd-api.c \ + yajl.c yajl_alloc.c yajl_buf.c yajl_encode.c yajl_gen.c \ + yajl_lex.c yajl_parser.c yajl_tree.c yajl_version.c +endif + + +nsr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ + $(top_builddir)/api/src/libgfapi.la + +noinst_HEADERS = nsr-internal.h etcd-api.h all-templates.c \ + yajl_alloc.h yajl_buf.h yajl_bytestack.h yajl_encode.h \ + yajl_lex.h yajl_parser.h yajl/yajl_common.h yajl/yajl_gen.h \ + yajl/yajl_parse.h yajl/yajl_tree.h yajl/yajl_version.h \ + $(top_srcdir)/xlators/lib/src/libxlator.h \ + $(top_srcdir)/glusterfsd/src/glusterfsd.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) \ + -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \ + -I$(top_srcdir)/rpc/rpc-lib/src -DSBIN_DIR=\"$(sbindir)\" + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +XLATOR_HEADER = $(top_srcdir)/libglusterfs/src/xlator.h + +CLEANFILES = nsr-cg.c + +nsr-cg.c: gen-fops.py codegen.py $(XLATOR_HEADER) all-templates.c + $(PYTHON) ./gen-fops.py $(XLATOR_HEADER) all-templates.c > $@ + +nsr.lo: nsr-cg.c + +uninstall-local: + rm -f $(DESTDIR)$(xlatordir)/nsr.so diff --git a/xlators/cluster/nsr-server/src/all-templates.c b/xlators/cluster/nsr-server/src/all-templates.c new file mode 100644 index 000000000..fa29de7b2 --- /dev/null +++ b/xlators/cluster/nsr-server/src/all-templates.c @@ -0,0 +1,345 @@ +/* + * You can put anything here - it doesn't even have to be a comment - and it + * will be ignored until we reach the first template-name comment. + */ + + +// template-name read-fop +$TYPE$ +nsr_$NAME$ (call_frame_t *frame, xlator_t *this, + $ARGS_LONG$) +{ + nsr_private_t *priv = this->private; + gf_boolean_t in_recon = _gf_false; + int32_t recon_term, recon_index; + + // allow reads during reconciliation + // TBD: allow "dirty" reads on non-leaders + if (xdata && + (dict_get_int32(xdata, RECON_TERM_XATTR, &recon_term) == 0) && + (dict_get_int32(xdata, RECON_INDEX_XATTR, &recon_index) == 0)) { + in_recon = _gf_true; + } + + if ((!priv->leader) && (in_recon == _gf_false)) { + goto err; + } + + STACK_WIND (frame, default_$NAME$_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->$NAME$, + $ARGS_SHORT$); + return 0; + +err: + STACK_UNWIND_STRICT ($NAME$, frame, -1, EREMOTE, + $DEFAULTS$); + return 0; +} + +// template-name read-dispatch +/* No "dispatch" function needed for $NAME$ */ + +// template-name read-fan-in +/* No "fan-in" function needed for $NAME$ */ + +// template-name read-continue +/* No "continue" function needed for $NAME$ */ + +// template-name read-complete +/* No "complete" function needed for $NAME$ */ + +// template-name write-fop +$TYPE$ +nsr_$NAME$ (call_frame_t *frame, xlator_t *this, + $ARGS_LONG$) +{ + nsr_local_t *local = NULL; + nsr_private_t *priv = this->private; + int op_errno = ENOMEM; + int from_leader; + int from_recon; + uint32_t ti = 0; + double must_be_up; + double are_up; + + /* + * Our first goal here is to avoid "split brain surprise" for users who + * specify exactly 50% with two- or three-way replication. That means + * either a more-than check against half the total replicas or an + * at-least check against half of our peers (one less). Of the two, + * only an at-least check supports the intuitive use of 100% to mean + * all replicas must be present, because "more than 100%" will never + * succeed regardless of which count we use. This leaves us with a + * slightly non-traditional definition of quorum ("at least X% of peers + * not including ourselves") but one that's useful enough to be worth + * it. + * + * Note that n_children and up_children *do* include the local + * subvolume, so we need to subtract one in each case. + */ + must_be_up = ((double)(priv->n_children - 1)) * priv->quorum_pct; + are_up = ((double)(priv->up_children - 1)) * 100.0; + if (are_up < must_be_up) { + /* Emulate the AFR client-side-quorum behavior. */ + op_errno = EROFS; + goto err; + } + + local = mem_get0(this->local_pool); + if (!local) { + goto err; + } +#if defined(NSR_CG_NEED_FD) + local->fd = fd_ref(fd); +#else + local->fd = NULL; +#endif + INIT_LIST_HEAD(&local->qlinks); + frame->local = local; + + if (xdata) { + from_leader = !!dict_get(xdata,NSR_TERM_XATTR); + from_recon = !!dict_get(xdata,RECON_TERM_XATTR) + && !!dict_get(xdata,RECON_INDEX_XATTR); + } + else { + from_leader = from_recon = _gf_false; + } + + // follower/recon path + // just send it to local node + if (from_leader || from_recon) { + atomic_inc(&priv->ops_in_flight); + STACK_WIND (frame, nsr_$NAME$_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->$NAME$, + $ARGS_SHORT$); + return 0; + } + + + if (!priv->leader/* || priv->fence_io*/) { + op_errno = EREMOTE; + goto err; + } + + + if (!xdata) { + xdata = dict_new(); + if (!xdata) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate xdata"); + goto err; + } + } + + if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set nsr-term"); + goto err; + } + + LOCK(&priv->index_lock); + ti = ++(priv->index); + UNLOCK(&priv->index_lock); + if (dict_set_int32(xdata,NSR_INDEX_XATTR,ti) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set index"); + goto err; + } + + local->stub = fop_$NAME$_stub (frame,nsr_$NAME$_continue, + $ARGS_SHORT$); + if (!local->stub) { + goto err; + } + + +#if defined(NSR_CG_QUEUE) + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd->inode); + if (!ictx) { + op_errno = EIO; + goto err; + } + LOCK(&ictx->lock); + if (ictx->active) { + gf_log (this->name, GF_LOG_DEBUG, + "queuing request due to conflict"); + /* + * TBD: enqueue only for real conflict + * + * Currently we just act like all writes are in + * conflict with one another. What we should really do + * is check the active/pending queues and defer only if + * there's a conflict there. + * + * It's important to check the pending queue because we + * might have an active request X which conflicts with + * a pending request Y, and this request Z might + * conflict with Y but not X. If we checked only the + * active queue then Z could jump ahead of Y, which + * would be incorrect. + */ + local->qstub = fop_$NAME$_stub (frame, + nsr_$NAME$_dispatch, + $ARGS_SHORT$); + if (!local->qstub) { + UNLOCK(&ictx->lock); + goto err; + } + list_add_tail(&local->qlinks,&ictx->pqueue); + ++(ictx->pending); + UNLOCK(&ictx->lock); + return 0; + } + else { + list_add_tail(&local->qlinks,&ictx->aqueue); + ++(ictx->active); + } + UNLOCK(&ictx->lock); +#endif + + return nsr_$NAME$_dispatch (frame, this, $ARGS_SHORT$); + +err: + if (local) { + if (local->stub) { + call_stub_destroy(local->stub); + } + if (local->qstub) { + call_stub_destroy(local->qstub); + } + if (local->fd) { + fd_unref(local->fd); + } + mem_put(local); + } + STACK_UNWIND_STRICT ($NAME$, frame, -1, op_errno, + $DEFAULTS$); + return 0; +} + +// template-name write-dispatch +$TYPE$ +nsr_$NAME$_dispatch (call_frame_t *frame, xlator_t *this, + $ARGS_LONG$) +{ + nsr_local_t *local = frame->local; + nsr_private_t *priv = this->private; + xlator_list_t *trav; + + atomic_inc(&priv->ops_in_flight); + + /* + * TBD: unblock pending request(s) if we fail after this point but + * before we get to nsr_$NAME$_complete (where that code currently + * resides). + */ + + local->call_count = priv->n_children - 1; + for (trav = this->children->next; trav; trav = trav->next) { + STACK_WIND (frame, nsr_$NAME$_fan_in, + trav->xlator, trav->xlator->fops->$NAME$, + $ARGS_SHORT$); + } + + // TBD: variable Issue count + return 0; +} + +// template-name write-fan-in +$TYPE$ +nsr_$NAME$_fan_in (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + $ARGS_LONG$) +{ + nsr_local_t *local = frame->local; + uint8_t call_count; + + gf_log (this->name, GF_LOG_TRACE, + "op_ret = %d, op_errno = %d\n", op_ret, op_errno); + + LOCK(&frame->lock); + call_count = --(local->call_count); + UNLOCK(&frame->lock); + + // TBD: variable Completion count + if (call_count == 0) { + call_resume(local->stub); + } + + return 0; +} + +// template-name write-continue +$TYPE$ +nsr_$NAME$_continue (call_frame_t *frame, xlator_t *this, + $ARGS_LONG$) +{ + STACK_WIND (frame, nsr_$NAME$_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->$NAME$, + $ARGS_SHORT$); + return 0; +} + +// template-name write-complete +$TYPE$ +nsr_$NAME$_complete (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + $ARGS_LONG$) +{ + nsr_private_t *priv = this->private; +#if defined(NSR_CG_NEED_FD) + nsr_local_t *local = frame->local; +#endif + +#if defined(NSR_CG_QUEUE) + nsr_inode_ctx_t *ictx; + nsr_local_t *next; + if (local->qlinks.next != &local->qlinks) { + list_del(&local->qlinks); + ictx = nsr_get_inode_ctx(this,local->fd->inode); + if (ictx) { + LOCK(&ictx->lock); + if (ictx->pending) { + /* + * TBD: dequeue *all* non-conflicting reqs + * + * With the stub implementation there can only + * be one request active at a time (zero here) + * so it's not an issue. In a real + * implementation there might still be other + * active requests to check against, and + * multiple pending requests that could + * continue. + */ + gf_log (this->name, GF_LOG_DEBUG, + "unblocking next request"); + --(ictx->pending); + next = list_entry (ictx->pqueue.next, + nsr_local_t, qlinks); + list_del(&next->qlinks); + list_add_tail(&next->qlinks,&ictx->aqueue); + call_resume(next->qstub); + } + else { + --(ictx->active); + } + UNLOCK(&ictx->lock); + } + } +#endif + +#if defined(NSR_CG_FSYNC) + nsr_mark_fd_dirty(this,local); +#endif + +#if defined(NSR_CG_NEED_FD) + fd_unref(local->fd); +#endif + + STACK_UNWIND_STRICT ($NAME$, frame, op_ret, op_errno, + $ARGS_SHORT$); + atomic_dec(&priv->ops_in_flight); + return 0; + +} diff --git a/xlators/cluster/nsr-server/src/codegen.py b/xlators/cluster/nsr-server/src/codegen.py new file mode 100755 index 000000000..709f5662f --- /dev/null +++ b/xlators/cluster/nsr-server/src/codegen.py @@ -0,0 +1,174 @@ +#!/usr/bin/python + +# This module lets us auto-generate boilerplate versions of fops and cbks, +# both for the client side and (eventually) on the server side as well. This +# allows us to implement common logic (e.g. leader fan-out and sequencing) +# once, without all the problems that come with copying and pasting the same +# code into dozens of functions (or failing to). +# +# I've tried to make this code pretty generic, since it's already likely to +# be used multiple ways within NSR. Really, we should use something like this +# to generate defaults.[ch] as well, to avoid the same sorts of mismatches +# that we've already seen and to which this approach makes NSR immune. That +# would require using something other than defaults.h as the input, but that +# format could be even simpler so that's a good thing too. + + +import re +import sys + +decl_re = re.compile("([a-z0-9_]+)$") +tmpl_re = re.compile("// template-name (.*)") + +class CodeGenerator: + + def __init__ (self): + self.decls = {} + self.skip = 0 + self.templates = {} + self.make_defaults = self._make_defaults + + # Redefine this to preprocess the name in a declaration, e.g. + # fop_lookup_t => nsrc_lookup + def munge_name (self, orig): + return orig + + # By default, this will convert the argument string into a sequence of + # (type, name) tuples minus the first self.skip (default zero) arguments. + # You can redefine it to skip the conversion, do a different conversion, + # or rearrange the arguments however you like. + def munge_args (self, orig): + args = [] + for decl in orig.strip("(); ").split(","): + m = decl_re.search(decl) + if m: + args.append((m.group(1),decl[:m.start(1)].strip())) + else: + raise RuntimeError("can't split %s into type+name"%decl) + return args[self.skip:] + + def add_decl (self, fname, ftype, fargs): + self.decls[self.munge_name(fname)] = (ftype, self.munge_args(fargs)) + + def parse_decls (self, path, pattern): + regex = re.compile(pattern) + f = open(path,"r") + have_decl = False + while True: + line = f.readline() + if not line: + break + m = regex.search(line) + if m: + if have_decl: + self.add_decl(f_name,f_type,f_args) + f_name = m.group(2) + f_type = m.group(1) + f_args = line[m.end(0):-1].strip() + if f_args.rfind(")") >= 0: + self.add_decl(f_name,f_type,f_args) + else: + have_decl = True + elif have_decl: + if line.strip() == "": + self.add_decl(f_name,f_type,f_args) + have_decl = False + else: + f_args += " " + f_args += line[:-1].strip() + if have_decl: + self.add_decl(f_name,f_type,f_args) + + # Legacy function (yeah, already) to load a single template. If you're + # using multiple templates, you're better off loading them all from one + # file using load_templates (note plural) instead. + def load_template (self, name, path): + self.templates[name] = open(path,"r").readlines() + + # Load multiple templates. Each is introduced by a special comment of + # the form + # + # // template-name xyz + # + # One side effect is that the block before the first such comment will be + # ignored. This seems like it might be useful some day so I'll leave it + # in, but if people trip over it maybe it will change. + # + # It is recommended to define templates in expected execution order, to + # make the result more readable than the inverted order (e.g. callback + # then fop) common in the rest of our code. + def load_templates (self, path): + t_name = None + for line in open(path,"r").readlines(): + if not line: + break + m = tmpl_re.match(line) + if m: + if t_name: + self.templates[t_name] = t_contents + t_name = m.group(1).strip() + t_contents = [] + elif t_name: + t_contents.append(line) + if t_name: + self.templates[t_name] = t_contents + + # Emit the template, with the following expansions: + # + # $NAME$ => function name (as passed in) + # $TYPE$ => function return value + # $ARGS_SHORT$ => argument list, including types + # $ARGS_LONG$ => argument list, *not* including types + # $DEFAULTS$ => default callback args (see below) + # + # The $DEFAULTS$ substitution is for the case where a fop (which has one + # set of arguments) needs to signal an error via STACK_UNWIND (which + # requires a different set of arguments). In this case we look up the + # argument list for the opposite direction, using self.make_defaults which + # the user must explicitly set to the method for the opposite direction. + # If an argument is a pointer, we replace it with NULL; otherwise we + # replace it with zero. It's a hack, but it's the only thing we do that + # doesn't require specific knowledge of our environment and the specific + # call we're handling. If this doesn't suffice, we'll have to add + # something like $ARG0$ which can be passed in for specific cases. + def emit (self, f_name, tmpl): + args = self.decls[f_name][1] + zipper = lambda x: x[0] + a_short = ", ".join(map(zipper,args)) + zipper = lambda x: x[1] + " " + x[0] + a_long = ", ".join(map(zipper,args)) + for line in self.templates[tmpl]: + line = line.replace("$NAME$",f_name) + line = line.replace("$TYPE$",self.decls[f_name][0]) + line = line.replace("$ARGS_SHORT$",a_short) + line = line.replace("$ARGS_LONG$",a_long) + line = line.replace("$DEFAULTS$",self.make_defaults(f_name)) + print(line.rstrip()) + + def _make_defaults (self, f_name): + result = [] + for arg in self.decls[f_name][1]: + if arg[1][-1] == "*": + result.append("NULL") + else: + result.append("0") + return ", ".join(result) + +if __name__ == "__main__": + type_re = "([a-z_0-9]+)" + name_re = "\(\*fop_([a-z0-9]+)_t\)" + full_re = type_re + " *" + name_re + cg = CodeGenerator() + cg.skip = 2 + cg.parse_decls(sys.argv[1],full_re) + """ + for k, v in cg.decls.iteritems(): + print("=== %s" % k) + print(" return type %s" % v[0]) + for arg in v[1]: + print(" arg %s (type %s)" % arg) + """ + cg.load_template("fop",sys.argv[2]) + cg.emit("lookup","fop") + cg.emit("rename","fop") + cg.emit("setxattr","fop") diff --git a/xlators/cluster/nsr-server/src/etcd-api.c b/xlators/cluster/nsr-server/src/etcd-api.c new file mode 100644 index 000000000..a07019244 --- /dev/null +++ b/xlators/cluster/nsr-server/src/etcd-api.c @@ -0,0 +1,831 @@ +/* + * Copyright (c) 2013, Red Hat + * All rights reserved. + + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* For asprintf */ +#if !defined(_GNU_SOURCE) +#define _GNU_SOURCE +#endif + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <curl/curl.h> +#include <yajl/yajl_tree.h> +#include "etcd-api.h" + + +#define DEFAULT_ETCD_PORT 4001 +#define SL_DELIM "\n\r\t ,;" + +typedef struct { + etcd_server *servers; +} _etcd_session; + +typedef struct { + char *key; + char *value; + int *index_in; /* pointer so NULL can be special */ + int index_out; /* NULL would be meaningless */ +} etcd_watch_t; + +typedef size_t curl_callback_t (void *, size_t, size_t, void *); + +int g_inited = 0; +const char *value_path[] = { "node", "value", NULL }; +const char *nodes_path[] = { "node", "nodes", NULL }; +const char *entry_path[] = { "key", NULL }; + +/* + * We only call this in case where it should be safe, but gcc doesn't know + * that so we use this to shut it up. + */ +char * +MY_YAJL_GET_STRING (yajl_val x) +{ + char *y = YAJL_GET_STRING(x); + + return y ? y : "bogus"; +} + +#if defined(DEBUG) +void +print_curl_error (char *intro, CURLcode res) +{ + printf("%s: %s\n",intro,curl_easy_strerror(res)); +} +#else +#define print_curl_error(intro,res) +#endif + + +etcd_session +etcd_open (etcd_server *server_list) +{ + _etcd_session *session; + + if (!g_inited) { + curl_global_init(CURL_GLOBAL_ALL); + g_inited = 1; + } + + session = malloc(sizeof(*session)); + if (!session) { + return NULL; + } + + /* + * Some day we'll set up more persistent connections, and keep track + * (via redirects) of which server is leader so that we can always + * try it first. For now we just push that to the individual request + * functions, which do the most brain-dead thing that can work. + */ + + session->servers = server_list; + return session; +} + + +void +etcd_close (etcd_session session) +{ + free(session); +} + +/* + * Normal yajl_tree_get is returning NULL for these paths even when I can + * verify (in gdb) that they exist. I suppose I could debug this for them, but + * this is way easier. + * + * TBD: see if common distros are packaging a JSON library that isn't total + * crap. + */ +yajl_val +my_yajl_tree_get (yajl_val root, char const **path, yajl_type type) +{ + yajl_val obj = root; + int i; + + for (;;) { + if (!*path) { + if (obj && (obj->type != type)) { + return NULL; + } + return obj; + } + if (obj->type != yajl_t_object) { + return NULL; + } + for (i = 0; /* nothing */; ++i) { + if (i >= obj->u.object.len) { + return NULL; + } + if (!strcmp(obj->u.object.keys[i],*path)) { + obj = obj->u.object.values[i]; + ++path; + break; + } + } + } +} + + +/* + * Looking directly at node->u.array seems terribly un-modular, but the YAJL + * tree interface doesn't seem to have any exposed API for iterating over the + * elements of an array. I tried using yajl_tree_get with an index in the + * path, either as a type-casted integer or as a string, but that didn't work. + */ +char * +parse_array_response (yajl_val parent) +{ + size_t i; + yajl_val item; + yajl_val value; + char *retval = NULL; + char *saved; + yajl_val node; + + node = my_yajl_tree_get(parent,nodes_path,yajl_t_array); + if (!node) { + return NULL; + } + + for (i = 0; i < node->u.array.len; ++i) { + item = node->u.array.values[i]; + if (!item) { + break; + } + value = my_yajl_tree_get(item,entry_path,yajl_t_string); + if (!value) { + break; + } + if (retval) { + saved = retval; + retval = NULL; + (void)asprintf (&retval, "%s\n%s", + saved, MY_YAJL_GET_STRING(value)); + free(saved); + } + else { + retval = strdup(MY_YAJL_GET_STRING(value)); + } + if (!retval) { + break; + } + } + + return retval; +} + +size_t +parse_get_response (void *ptr, size_t size, size_t nmemb, void *stream) +{ + yajl_val node; + yajl_val value; + + node = yajl_tree_parse(ptr,NULL,0); + if (node) { + value = my_yajl_tree_get(node,value_path,yajl_t_string); + if (value) { + /* + * YAJL probably copied it once, now we're going to + * copy it again. If anybody really cares for such + * small and infrequently used values, we'd have to do + * do something much more complicated (like using the + * stream interface) to avoid the copy. Right now it's + * just not worth it. + */ + *((char **)stream) = strdup(MY_YAJL_GET_STRING(value)); + } + else { + /* Might as well try this. */ + *((char **)stream) = parse_array_response(node); + } + yajl_tree_free(node); + } + + return size*nmemb; +} + + +etcd_result +etcd_get_one (_etcd_session *session, char *key, etcd_server *srv, char *prefix, + char *post, curl_callback_t cb, char **stream) +{ + char *url; + CURL *curl; + CURLcode curl_res; + etcd_result res = ETCD_WTF; + void *err_label = &&done; + + if (asprintf(&url,"http://%s:%u/v2/%s%s", + srv->host,srv->port,prefix,key) < 0) { + goto *err_label; + } + printf("url = %s\n",url); + err_label = &&free_url; + + curl = curl_easy_init(); + if (!curl) { + goto *err_label; + } + err_label = &&cleanup_curl; + + /* TBD: add error checking for these */ + curl_easy_setopt(curl,CURLOPT_URL,url); + curl_easy_setopt(curl,CURLOPT_FOLLOWLOCATION,1L); + curl_easy_setopt(curl,CURLOPT_WRITEFUNCTION,cb); + curl_easy_setopt(curl,CURLOPT_WRITEDATA,stream); + if (post) { + curl_easy_setopt(curl,CURLOPT_POST,1L); + curl_easy_setopt(curl,CURLOPT_POSTFIELDS,post); + } +#if defined(DEBUG) + curl_easy_setopt(curl,CURLOPT_VERBOSE,1L); +#endif + + curl_res = curl_easy_perform(curl); + if (curl_res != CURLE_OK) { + print_curl_error("perform",curl_res); + goto *err_label; + } + + res = ETCD_OK; + +cleanup_curl: + curl_easy_cleanup(curl); +free_url: + free(url); +done: + return res; +} + + +char * +etcd_get (etcd_session session_as_void, char *key) +{ + _etcd_session *session = session_as_void; + etcd_server *srv; + etcd_result res; + char *value = NULL; + + for (srv = session->servers; srv->host; ++srv) { + res = etcd_get_one(session,key,srv,"keys/",NULL, + parse_get_response,&value); + if ((res == ETCD_OK) && value) { + return value; + } + } + + return NULL; +} + + +size_t +parse_watch_response (void *ptr, size_t size, size_t nmemb, void *stream) +{ + yajl_val node; + yajl_val value; + etcd_watch_t *watch = stream; + static const char *i_path[] = { "node", "modifiedIndex", NULL }; + static const char *k_path[] = { "node", "key", NULL }; + static const char *v_path[] = { "node", "value", NULL }; + + node = yajl_tree_parse(ptr,NULL,0); + if (node) { + value = my_yajl_tree_get(node,i_path,yajl_t_number); + if (value) { + watch->index_out = strtoul(YAJL_GET_NUMBER(value), + NULL,10); + } + value = my_yajl_tree_get(node,k_path,yajl_t_string); + if (value) { + watch->key = strdup(MY_YAJL_GET_STRING(value)); + } + value = my_yajl_tree_get(node,v_path,yajl_t_string); + if (value) { + watch->value = strdup(MY_YAJL_GET_STRING(value)); + } + } + + return size*nmemb; +} + + +etcd_result +etcd_watch (etcd_session session_as_void, char *pfx, + char **keyp, char **valuep, int *index_in, int *index_out) +{ + _etcd_session *session = session_as_void; + etcd_server *srv; + etcd_result res; + etcd_watch_t watch; + char *path; + + if (index_in) { + if (asprintf(&path,"%s?wait=true&recursive=true&waitIndex=%d", + pfx,*index_in) < 0) { + return ETCD_WTF; + } + } + else { + if (asprintf(&path,"%s?wait=true&recursive=true",pfx) < 0) { + return ETCD_WTF; + } + } + + memset(&watch,0,sizeof(watch)); + watch.index_in = index_in; + + for (srv = session->servers; srv->host; ++srv) { + res = etcd_get_one(session,path,srv,"keys/",NULL, + parse_watch_response,(char **)&watch); + if (res == ETCD_OK) { + if (keyp) { + *keyp = watch.key; + } + if (valuep) { + *valuep = watch.value; + } + if (index_out) { + *index_out = watch.index_out; + } + break; + } + } + + free(path); + return res; +} + + +size_t +parse_set_response (void *ptr, size_t size, size_t nmemb, void *stream) +{ + yajl_val node; + yajl_val value; + etcd_result res = ETCD_PROTOCOL_ERROR; + /* + * Success responses contain prevValue and index. Failure responses + * contain errorCode and cause. Among all these, index seems to be the + * one we're most likely to need later, so look for that. + */ + static const char *path[] = { "node", "modifiedIndex", NULL }; + + node = yajl_tree_parse(ptr,NULL,0); + if (node) { + value = my_yajl_tree_get(node,path,yajl_t_number); + if (value) { + res = ETCD_OK; + } + } + + *((etcd_result *)stream) = res; + return size*nmemb; +} + + +size_t +parse_lock_response (void *ptr, size_t size, size_t nmemb, void *stream) +{ + *((char **)stream) = strdup(ptr); + return size*nmemb; +} + + +/* + * There are two use cases, based on is_lock. + * + * If is_lock is null, we use the "keys" namespace. A null value means an + * HTTP DELETE; precond and ttl are both ignored. Otherwise we're setting a + * value, with *optional* precond and ttl. + * + * If is_lock is set, we use the "locks" namespace. A null value means an + * HTTP DELETE as before, and we still ignore ttl as before, but now precond + * must be set to represent the lock index. Otherwise ttl must be present, + * and we decide what to do based on precond. If it's null, this is an + * initial lock so we use an HTTP POST. Otherwise it's a renewal so we use + * an HTTP PUT instead. + */ +etcd_result +etcd_set_one (_etcd_session *session, char *key, char *value, + char *precond, unsigned int ttl, etcd_server *srv, + char **is_lock) +{ + char *url; + char *contents = NULL; + CURL *curl; + etcd_result res = ETCD_WTF; + CURLcode curl_res; + void *err_label = &&done; + char *namespace; + char *http_cmd; + char *orig_index; + + if (is_lock) { + namespace = "mod/v2/lock"; + if (value) { + if (!ttl) { + /* Lock/renew must specify ttl. */ + return ETCD_WTF; + } + http_cmd = precond ? "PUT" : "POST"; + } + else { + if (!precond) { + /* Unlock must specify index. */ + return ETCD_WTF; + } + http_cmd = "DELETE"; + } + orig_index = *is_lock; + } + else { + namespace = "v2/keys"; + http_cmd = value ? "PUT" : "DELETE"; + } + + if (asprintf(&url,"http://%s:%u/%s/%s", + srv->host,srv->port,namespace,key) < 0) { + goto *err_label; + } + err_label = &&free_url; + + if (is_lock) { + if (precond) { + if (asprintf(&contents,"index=%s",precond) < 0) { + goto *err_label; + } + err_label = &&free_contents; + } + if (ttl) { + if (contents) { + char *c2; + if (asprintf(&c2,"ttl=%u;%s",ttl,contents) < 0) { + goto *err_label; + } + free(contents); + contents = c2; + } + else { + if (asprintf(&contents,"ttl=%u",ttl) < 0) { + goto *err_label; + } + } + err_label = &&free_contents; + } + } + else { + if (value) { + if (asprintf(&contents,"value=%s",value) < 0) { + goto *err_label; + } + err_label = &&free_contents; + } + if (precond) { + char *c2; + if (asprintf(&c2,"%s;prevValue=%s",contents, + precond) < 0) { + goto *err_label; + } + free(contents); + contents = c2; + err_label = &&free_contents; + } + if (ttl) { + char *c2; + if (asprintf(&c2,"%s;ttl=%u",contents,ttl) < 0) { + goto *err_label; + } + free(contents); + contents = c2; + err_label = &&free_contents; + } + } + + curl = curl_easy_init(); + if (!curl) { + goto *err_label; + } + err_label = &&cleanup_curl; + + /* TBD: add error checking for these */ + curl_easy_setopt(curl,CURLOPT_CUSTOMREQUEST,http_cmd); + curl_easy_setopt(curl,CURLOPT_URL,url); + curl_easy_setopt(curl,CURLOPT_FOLLOWLOCATION,1L); + curl_easy_setopt(curl,CURLOPT_POSTREDIR,CURL_REDIR_POST_ALL); + + if (is_lock && value && !precond) { + /* Only do this for an initial lock, not a renewal. */ + curl_easy_setopt (curl, CURLOPT_WRITEFUNCTION, + parse_lock_response); + curl_easy_setopt(curl,CURLOPT_WRITEDATA,is_lock); + } + else { + curl_easy_setopt (curl, CURLOPT_WRITEFUNCTION, + parse_set_response); + curl_easy_setopt(curl,CURLOPT_WRITEDATA,&res); + } + + /* + * CURLOPT_HTTPPOST would be easier, but it looks like etcd will barf on + * that. Sigh. + */ + if (contents) { + curl_easy_setopt(curl,CURLOPT_POST,1L); + curl_easy_setopt(curl,CURLOPT_POSTFIELDS,contents); + } +#if defined(DEBUG) + curl_easy_setopt(curl,CURLOPT_VERBOSE,1L); +#endif + + curl_res = curl_easy_perform(curl); + if (curl_res != CURLE_OK) { + print_curl_error("perform",curl_res); + goto *err_label; + } + + if (is_lock && value) { + if (!precond) { + /* + * If this is an initial lock, parse_lock_response would + * have been unable to set "res" for us. Instead, we + * set it here if the index string got updated. + */ + if (*is_lock != orig_index) { + res = ETCD_OK; + } + } + else { + /* + * If this is a lock renewal, then a successful call + * will pass through neither parse_lock_response nor + * parse_get_response. The curl response code alone + * is sufficient. + */ + res = ETCD_OK; + } + } + + /* + * If the request succeeded, or at least got to the server and failed + * there, parse_set_response should have set res appropriately. + */ + +cleanup_curl: + curl_easy_cleanup(curl); +free_contents: + free(contents); /* might already be NULL for delete, but that's OK */ +free_url: + free(url); +done: + return res; +} + + +etcd_result +etcd_set (etcd_session session_as_void, char *key, char *value, + char *precond, unsigned int ttl) +{ + _etcd_session *session = session_as_void; + etcd_server *srv; + etcd_result res; + + for (srv = session->servers; srv->host; ++srv) { + res = etcd_set_one(session,key,value,precond,ttl,srv,NULL); + /* + * Protocol errors are likely to be things like precondition + * failures, which won't be helped by retrying on another + * server. + */ + if ((res == ETCD_OK) || (res == ETCD_PROTOCOL_ERROR)) { + return res; + } + } + + return ETCD_WTF; +} + + +/* + * This uses the same path and status checks as SET, but with a different HTTP + * command instead of data. Precondition and TTL are obviously not used in + * this case, though a conditional delete would be a cool feature for etcd. I + * think you can get a timed delete by doing a conditional set to the current + * value with a TTL, but I haven't actually tried it. + */ +etcd_result +etcd_delete (etcd_session session_as_void, char *key) +{ + _etcd_session *session = session_as_void; + etcd_server *srv; + etcd_result res; + + for (srv = session->servers; srv->host; ++srv) { + res = etcd_set_one(session,key,NULL,NULL,0,srv,NULL); + if (res == ETCD_OK) { + break; + } + } + + return res; +} + + +etcd_result +etcd_lock (etcd_session session_as_void, char *key, unsigned int ttl, + char *index_in, char **index_out) +{ + _etcd_session *session = session_as_void; + etcd_server *srv; + etcd_result res; + char *tmp = NULL; + + for (srv = session->servers; srv->host; ++srv) { + res = etcd_set_one(session,key,"hack",index_in,ttl,srv,&tmp); + if (res == ETCD_OK) { + if (index_out) { + *index_out = tmp; + } + break; + } + } + + return res; +} + + +etcd_result +etcd_unlock (etcd_session session_as_void, char *key, char *index) +{ + _etcd_session *session = session_as_void; + etcd_server *srv; + etcd_result res; + char *tmp = NULL; + + for (srv = session->servers; srv->host; ++srv) { + res = etcd_set_one(session,key,NULL,index,0,srv,&tmp); + if (res == ETCD_OK) { + break; + } + } + + return res; +} +size_t +store_leader (void *ptr, size_t size, size_t nmemb, void *stream) +{ + *((char **)stream) = strdup(ptr); + return size * nmemb; +} + + +char * +etcd_leader (etcd_session session_as_void) +{ + _etcd_session *session = session_as_void; + etcd_server *srv; + etcd_result res; + char *value = NULL; + + for (srv = session->servers; srv->host; ++srv) { + res = etcd_get_one(session,"leader",srv,"",NULL, + store_leader,&value); + if ((res == ETCD_OK) && value) { + return value; + } + } + + return NULL; +} + + +void +free_sl (etcd_server *server_list) +{ + size_t num_servers; + + for (num_servers = 0; server_list[num_servers].host; ++num_servers) { + free(server_list[num_servers].host); + } + free(server_list); +} + + +int +_count_matching (char *text, char *cset, int result) +{ + char *t; + int res = 0; + + for (t = text; *t; ++t) { + if ((strchr(cset,*t) != NULL) != result) { + break; + } + ++res; + } + + return res; +} + +#define count_matching(t,cs) _count_matching(t,cs,1) +#define count_nonmatching(t,cs) _count_matching(t,cs,0) + + +etcd_session +etcd_open_str (char *server_names) +{ + char *snp; + int run_len; + int host_len; + size_t num_servers; + etcd_server *server_list; + etcd_session *session; + + /* + * Yeah, we iterate over the string twice so we can allocate an + * appropriately sized array instead of turning it into a linked list. + * Unfortunately this means we can't use strtok* which is destructive + * with no platform-independent way to reverse the destructive effects. + */ + + num_servers = 0; + snp = server_names; + while (*snp) { + run_len = count_nonmatching(snp,SL_DELIM); + if (!run_len) { + snp += count_matching(snp,SL_DELIM); + continue; + } + ++num_servers; + snp += run_len; + } + + if (!num_servers) { + return NULL; + } + + server_list = calloc(num_servers+1,sizeof(*server_list)); + if (!server_list) { + return NULL; + } + num_servers = 0; + + snp = server_names; + while (*snp) { + run_len = count_nonmatching(snp,SL_DELIM); + if (!run_len) { + snp += count_matching(snp,SL_DELIM); + continue; + } + host_len = count_nonmatching(snp,":"); + if ((run_len - host_len) > 1) { + server_list[num_servers].host = strndup(snp,host_len); + server_list[num_servers].port = (unsigned short) + strtoul(snp+host_len+1,NULL,10); + } + else { + server_list[num_servers].host = strndup(snp,run_len); + server_list[num_servers].port = DEFAULT_ETCD_PORT; + } + ++num_servers; + snp += run_len; + } + + session = etcd_open(server_list); + if (!session) { + free_sl(server_list); + } + return session; +} + + +void +etcd_close_str (etcd_session session) +{ + free_sl(((_etcd_session *)session)->servers); + etcd_close(session); +} diff --git a/xlators/cluster/nsr-server/src/etcd-api.h b/xlators/cluster/nsr-server/src/etcd-api.h new file mode 100644 index 000000000..66275d40d --- /dev/null +++ b/xlators/cluster/nsr-server/src/etcd-api.h @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2013, Red Hat + * All rights reserved. + + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Description of an etcd server. For now it just includes the name and + * port, but some day it might include other stuff like SSL certificate + * information. + */ + +typedef enum { + ETCD_OK = 0, + ETCD_PROTOCOL_ERROR, + /* TBD: add other error categories here */ + ETCD_WTF /* anything we can't easily categorize */ +} etcd_result; + +typedef struct { + char *host; + unsigned short port; +} etcd_server; + +typedef void *etcd_session; + +/* + * etcd_open + * + * Establish a session to an etcd cluster, with automatic reconnection and + * so on. + * + * server_list + * Array of etcd_server structures, with the last having host=NULL. The + * caller is responsible for ensuring that this remains valid as long as + * the session exists. + */ +etcd_session etcd_open (etcd_server *server_list); + + +/* + * etcd_open_str + * + * Same as etcd_open, except that the servers are specified as a list of + * host:port strings, separated by comma/semicolon or whitespace. + */ +etcd_session etcd_open_str (char *server_names); + + +/* + * etcd_close + * + * Terminate a session, closing connections and freeing memory (or any other + * resources) associated with it. + */ +void etcd_close (etcd_session session); + + +/* + * etcd_close + * + * Same as etcd_close, but also free the server list as etcd_open_str would + * have allocated it. + */ +void etcd_close_str (etcd_session session); + + +/* + * etcd_get + * + * Fetch a key from one of the servers in a session. The return value is a + * newly allocated string, which must be freed by the caller. + * + * key + * The etcd key (path) to fetch. + */ +char * etcd_get (etcd_session session, char *key); + + +/* + * etcd_watch + * Watch the set of keys matching a prefix. + * + * pfx + * The etcd key prefix (like a path) to watch. + * + * keyp + * Space for a pointer to the key that was added/modified/deleted. + * + * valuep + * Space for a pointer to the value if a key was added/modified. A delete + * is signified by this being set to NULL. + * + * index_in + * Pointer to an index to be used for *issuing* the watch request, or + * NULL for a watch without an index. + * + * index_out + * Pointer to space for an index *returned* by etcd, or NULL to mean don't + * bother. + * + * In normal usage, index_in will be NULL and index_out will be set to receive + * the index for the first watch. Subsequently, index_in will be set to + * provide the previous index (plus one) and index_out will be set to receive + * the next. It's entirely legitimate to point both at the same variable. + */ + +etcd_result etcd_watch (etcd_session session, char *pfx, + char **keyp, char **valuep, + int *index_in, int *index_out); + + +/* + * etcd_set + * + * Write a key, with optional TTL and/or previous value (as a precondition). + * + * key + * The etcd key (path) to set. + * + * value + * New value as a null-terminated string. Unlike etcd_get, we can derive + * the length ourselves instead of needing it to be passed in separately. + * + * precond + * Required previous value as a null-terminated string, or NULL to mean + * an unconditional set. + * + * ttl + * Time in seconds after which the value will automatically expire and be + * deleted, or zero to mean no auto-expiration. + */ + +etcd_result etcd_set (etcd_session session, char *key, char *value, + char *precond, unsigned int ttl); + + +/* + * etcd_delete + * + * Delete a key from one of the servers in a session. + * + * key + * The etcd key (path) to delete. + */ + +etcd_result etcd_delete (etcd_session session, char *key); + + +/* + * etcd_leader + * + * Get the identify of the current leader. + */ + +char * etcd_leader (etcd_session session); + +/* + * etcd_lock + * + * Take or renew a lock - really a lease but the etcd folks call it a lock so + * we'll follow suit. + * + * key + * The path (in the "locks" namespace) for the lock. + * + * ttl + * Time in seconds for the lock. + * + * index_in (optional, indicates renewal) + * Lock index from previous lock call. + * + * index_out (only used for initial lock) + * Place for the new lock index. You must free this. + */ + +etcd_result etcd_lock (etcd_session session_as_void, char *key, + unsigned int ttl, char *index_in, char **index_out); + +/* + * etcd_unlock + * + * Release a lock (see etcd_lock regarding terminology). + * + * key + * The path (in the "locks" namespace) for the lock. + * + * index + * Lock index from previous lock call. + */ + +etcd_result etcd_unlock (etcd_session session_as_void, char *key, + char *index); + diff --git a/xlators/cluster/nsr-server/src/etcd-sim.c b/xlators/cluster/nsr-server/src/etcd-sim.c new file mode 100644 index 000000000..d0bea12c7 --- /dev/null +++ b/xlators/cluster/nsr-server/src/etcd-sim.c @@ -0,0 +1,280 @@ +/* + * Copyright (c) 2014, Red Hat + * All rights reserved. + + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <sys/file.h> + +#include "mem-pool.h" + +/* + * Mock implementation of etcd + * The etcd file is simulated in /tmp/<server-names> + * Writes from Multiple writers are protected using file lock. +*/ + +#include "etcd-api.h" +#define MAX_KEY_LEN 64 +#define MAX_VALUE_LEN 64 +#define MAX_EXPIRE_LEN 16 + +etcd_session +etcd_open (etcd_server *server_list) +{ + return NULL; +} + +typedef struct _etcd_sim_s { + char *path; +} etcd_sim_t; + +void +etcd_close (etcd_session this) +{ + etcd_sim_t *sim = (etcd_sim_t *)this; + free(sim->path); + free(this); +} + + +char * +etcd_get_1 (FILE *stream, char *key) +{ + char *str = NULL; + size_t len; + unsigned long expires; + char *ret; + + // Read the file + while(1) { + if(str) { + free(str); + str = NULL; + } + if (getline((char **)&str, &len,stream) == -1) { + break; + } + if (!strncmp(str, key, strlen(key))) { + char k[256], s[256]; + sscanf(str,"%s %s %lu",k, s, &expires); + // check if key is expired. + if (time(NULL) > expires) { + /* Keep looking for an unexpired entry. */ + continue; + } + ret = calloc(1, strlen(s) + 1); + strcpy(ret,s); + free(str); + return(ret); + } + } + return NULL; +} + + +char * +etcd_get (etcd_session this, char *key) +{ + etcd_sim_t *sim = (etcd_sim_t *)this; + int fd; + FILE *stream; + char *retval; + + fd = open(sim->path,O_RDONLY); + if (!fd) { + return NULL; + } + + stream = fdopen(fd,"r"); + (void)flock(fd,LOCK_SH); + retval = etcd_get_1(stream,key); + (void)flock(fd,LOCK_UN); + fclose(stream); /* closes fd as well */ + + return retval; +} + + +etcd_result +etcd_set_1 (FILE *stream, char *key, char *value, + char *precond, unsigned int ttl) +{ + char *str = NULL; + char tp[255]; + size_t len; + unsigned long expires; + + while(1) { + if(str) { + free(str); + str = NULL; + } + if (getline((char **)&str, &len,stream) == -1) { + break; + } + if (!strncmp(str, key, strlen(key))) { + char k[256], s[256]; + sscanf(str,"%s %s %lu",k, s, &expires); + // check if the present key is expired + if (time(NULL) > expires) { + /* Keep looking for an unexpired entry. */ + continue; + } + /* + * The only case in which we should fail here is if a + * precondition was specified and does not match the + * current (non-expired) value. + */ + if (precond && strcmp(precond, s)) { + free(str); + return ETCD_WTF; + } + fseek(stream, -strlen(str), SEEK_CUR); + free(str); + goto here; + } + } +here: + memset(tp, 0, 255); + sprintf(tp,"%*s %*s %*lu\n", + -MAX_KEY_LEN, key, -MAX_VALUE_LEN, value, + -MAX_EXPIRE_LEN, ttl ? time(NULL) + ttl : ~0); + if (fwrite(tp, 1,strlen(tp), stream) != strlen(tp)) { + return ETCD_WTF; + } + fflush(stream); + fsync(fileno(stream)); + return ETCD_OK; +} + + +etcd_result +etcd_set (etcd_session this, char *key, char *value, + char *precond, unsigned int ttl) +{ + etcd_sim_t *sim = (etcd_sim_t *)this; + int fd; + FILE *stream; + etcd_result retval; + + fd = open(sim->path,O_RDWR); + if (fd < 0) { + return ETCD_WTF; + } + + stream = fdopen(fd,"r+"); + (void)flock(fd,LOCK_EX); + retval = etcd_set_1(stream,key,value,precond,ttl); + (void)flock(fd,LOCK_UN); + fclose(stream); /* closes fd as well */ + + return retval; +} + + +etcd_session +etcd_open_str (char *server_names) +{ + etcd_sim_t *sim; + int fd; + + sim = calloc(1, sizeof(etcd_sim_t)); + (void)asprintf(&sim->path,"/tmp/%s",server_names); + + fd = open(sim->path, O_RDWR | O_CREAT, 0777); + if (fd == -1) { + free(sim->path); + free(sim); + return NULL; + } + + close(fd); + return ((void *)sim); +} + + +void +etcd_close_str (etcd_session this) +{ + etcd_close(this); +} + +etcd_result +etcd_delete (etcd_session this, char *key) +{ + return ETCD_WTF; +} + +char * +etcd_leader (etcd_session this_as_void) +{ + return NULL; +} + +etcd_result +etcd_watch (etcd_session this, char *pfx, char **keyp, char **valuep, + int *index_in, int *index_out) +{ + return ETCD_WTF; +} + +etcd_result +etcd_lock (etcd_session session_as_void, char *key, unsigned int ttl, + char *index_in, char **index_out) +{ + char *path; + int fd; + + if (!index_in) { + if (gf_asprintf(&path,"/var/tmp/%s",key) < 0) { + return ETCD_WTF; + } + fd = open(path,O_RDWR|O_CREAT,0666); + GF_FREE(path); + if (fd < 0) { + return ETCD_WTF; + } + if (flock(fd,LOCK_EX) < 0) { + close(fd); + return ETCD_WTF; + } + *index_out = strdup("42"); + } + + /* + * Yes, we leak an fd by not closing it here (and nobody else even + * knows about it). That would be awful in any other context, but + * for test scripts it won't matter. + */ + return ETCD_OK; +} + diff --git a/xlators/cluster/nsr-server/src/gen-fops.py b/xlators/cluster/nsr-server/src/gen-fops.py new file mode 100755 index 000000000..1639f489c --- /dev/null +++ b/xlators/cluster/nsr-server/src/gen-fops.py @@ -0,0 +1,120 @@ +#!/usr/bin/python + +# This script generates the boilerplate versions of most fops and cbks in the +# server. This allows the details of leadership-status checking, sequencing +# between leader and followers (including fan-out), and basic error checking +# to be centralized one place, with per-operation code kept to a minimum. + +import sys +import codegen + +type_re = "([a-z_0-9]+)" +name_re = "\(\*fop_([a-z0-9]+)_t\)" +full_re = type_re + " *" + name_re +fop_cg = codegen.CodeGenerator() +fop_cg.skip = 2 +fop_cg.parse_decls(sys.argv[1],full_re) +fop_cg.load_templates(sys.argv[2]) + +# Use the multi-template feature to generate multiple callbacks from the same +# parsed declarations. +type_re = "([a-z_0-9]+)" +name_re = "\(\*fop_([a-z0-9]+)_cbk_t\)" +full_re = type_re + " *" + name_re +cbk_cg = codegen.CodeGenerator() +cbk_cg.skip = 5 +cbk_cg.parse_decls(sys.argv[1],full_re) +cbk_cg.load_templates(sys.argv[2]) + +# This is a nasty little trick to handle the case where a generated fop needs +# a set of default arguments for the corresponding callback. +fop_cg.make_defaults = cbk_cg.make_defaults + +# We need two types of templates. The first, for pure read operations, just +# needs to do a simple am-i-leader check (augmented to allow dirty reads). +# The second, for pure writes, needs to do fan-out to followers between those +# initial checks and local execution. There are other operations that don't +# fit neatly into either category - e.g. lock ops or fsync - so we'll just have +# to handle those manually. The table thus includes entries only for those we +# can categorize. The special cases, plus any new operations we've never even +# heard of, aren't in there. +# +# Various keywords can be used to define/undefine preprocessor symbols used +# in the templates, on a per-function basis. For example, if the keyword here +# is "fsync" (lowercase word or abbreviation) that will cause NSR_CG_FSYNC +# (prefix plus uppercase version) to be defined above all of the generated code +# for that fop. + +fop_table = { + "access": "read", + "create": "write", + "discard": "write", +# "entrylk": "read", + "fallocate": "write", +# "fentrylk": "read", + "fgetxattr": "read", +# "finodelk": "read", +# "flush": "read", + "fremovexattr": "write", + "fsetattr": "write", + "fsetxattr": "write", + "fstat": "read", +# "fsync": "read", +# "fsyncdir": "read", + "ftruncate": "write", + "fxattrop": "write", + "getxattr": "read", +# "inodelk": "read", + "link": "write", +# "lk": "read", +# "lookup": "read", + "mkdir": "write", + "mknod": "write", + "open": "write", + "opendir": "read", + "rchecksum": "read", + "readdir": "read", + "readdirp": "read", + "readlink": "read", + "readv": "read", + "removexattr": "write", + "rename": "write", + "rmdir": "write", + "setattr": "write", + "setxattr": "write", + "stat": "read", + "statfs": "read", + "symlink": "write", + "truncate": "write", + "unlink": "write", + "writev": "write,fsync,queue", + "xattrop": "write", +} + +fops_done = [] +for x in sorted(fop_cg.decls.keys()): + if x in fop_table.keys(): + info = fop_table[x].split(",") + kind = info[0] + flags = info[1:] + if ("fsync" in flags) or ("queue" in flags): + flags.append("need_fd") + for fname in flags: + print "#define NSR_CG_%s" % fname.upper() + cbk_cg.emit(x,kind+"-complete") + fop_cg.emit(x,kind+"-continue") + cbk_cg.emit(x,kind+"-fan-in") + fop_cg.emit(x,kind+"-dispatch") + fop_cg.emit(x,kind+"-fop") + for fname in flags: + print "#undef NSR_CG_%s" % fname.upper() + fops_done.append(x) + else: + print("/* No code emitted for %s */"%x) + print("") + +# Just for fun, emit the fops table too. +print("struct xlator_fops fops = {") +for x in fops_done: + print(" .%s = nsr_%s,"%(x,x)) +print("};") diff --git a/xlators/cluster/nsr-server/src/leader.c b/xlators/cluster/nsr-server/src/leader.c new file mode 100644 index 000000000..02a2609c8 --- /dev/null +++ b/xlators/cluster/nsr-server/src/leader.c @@ -0,0 +1,138 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <regex.h> +//#include <stdlib.h> +#include <string.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "call-stub.h" +#include "defaults.h" +#include "xlator.h" +#include "api/src/glfs.h" +#include "api/src/glfs-internal.h" + +#ifndef NSR_SIM_ETCD +#include "etcd-api.h" +#endif +#include "nsr-internal.h" +#include "../../nsr-recon/src/recon_driver.h" +#include "../../nsr-recon/src/recon_xlator.h" + +#define NSR_TTL 5 + +static void +nsr_set_leader (xlator_t *this, etcd_session etcd) +{ + long term = 0; + etcd_result res; + nsr_private_t *priv = this->private; + char n_t[sizeof(long)+1]; + char *text = NULL; + + gf_log (this->name, GF_LOG_INFO, "Just became leader"); + + text = etcd_get(etcd, priv->term_key); + if(text == NULL) { + term = 0; + } else { + term = strtol(text, NULL, 10); + } + sprintf(n_t,"%ld",term+1); + res = etcd_set(etcd, priv->term_key,n_t,text,0); + if(res != ETCD_OK) { + gf_log (this->name, GF_LOG_ERROR, "failed to set term"); + return; + } + priv->leader = _gf_true; + + priv->current_term = term + 1; + + if (priv->nsr_recon_start == _gf_false) { + atomic_fetch_and(&(priv->fence_io), 0); + return; + } + + // Move this inside recon notify??? + atomic_fetch_or(&(priv->fence_io), 1); + + nsr_recon_notify_event_set_leader(priv); + + return; +} + +void * +nsr_leader_thread (void *arg) +{ + xlator_t *this = (xlator_t *) arg; + nsr_private_t *priv = this->private; + etcd_result res; + char *index_in = NULL; + char *index_out = NULL; + + gf_log (this->name, GF_LOG_INFO, + "calling etcd_open_str on servers %s", priv->etcd_servers); + + priv->etcd = etcd_open_str(priv->etcd_servers); + if (!(priv->etcd)) { + gf_log (this->name, GF_LOG_ERROR, + "failed to open etcd session\n"); + return NULL; + } + + priv->leader_inited = 1; + + for (;;) { + /* Not leader yet. Try to become leader. */ + for (;;) { + res = etcd_lock (priv->etcd, priv->leader_key, NSR_TTL, + index_in, &index_out); + if (res == ETCD_OK) { + break; + } + gf_log (this->name, GF_LOG_WARNING, + "etcd_lock failed (%d)", res); + sleep(1); + } + /* We're there. Notify other parts of the code. */ + nsr_set_leader(this,priv->etcd); + /* Try to retain leadership. */ + index_in = index_out; + index_out = NULL; + for (;;) { + res = etcd_lock (priv->etcd, priv->leader_key, NSR_TTL, + index_in, &index_out); + if (index_out && (index_in != index_out)) { + if (index_in) { + free(index_in); + } + index_in = index_out; + index_out = NULL; + } + if (res != ETCD_OK) { + gf_log (this->name, GF_LOG_WARNING, + "lost leadership (%d)", res); + if (index_out) { + free(index_out); + } + break; + } + sleep(1); + } + } + + etcd_close_str(priv->etcd); + return NULL; +} + diff --git a/xlators/cluster/nsr-server/src/nsr-internal.h b/xlators/cluster/nsr-server/src/nsr-internal.h new file mode 100644 index 000000000..72b61bfa5 --- /dev/null +++ b/xlators/cluster/nsr-server/src/nsr-internal.h @@ -0,0 +1,101 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <sys/stat.h> +#include <sys/types.h> + +#define LEADER_XATTR "user.nsr.leader" +#define SECOND_CHILD(xl) (xl->children->next->xlator) + +enum { + gf_mt_nsr_private_t = gf_common_mt_end + 1, + gf_mt_nsr_fd_ctx_t, + gf_mt_nsr_inode_ctx_t, + gf_mt_nsr_dirty_t, + gf_mt_nsr_end +}; + +typedef enum nsr_recon_notify_ev_id_t { + NSR_RECON_SET_LEADER = 1, + NSR_RECON_ADD_CHILD = 2 +} nsr_recon_notify_ev_id_t; + +typedef struct _nsr_recon_notify_ev_s { + nsr_recon_notify_ev_id_t id; + uint32_t index; // in case of add + struct list_head list; +} nsr_recon_notify_ev_t; + +typedef struct { + char *etcd_servers; + char *subvol_uuid; + char *leader_key; + char *term_key; + char *brick_uuid; + gf_boolean_t leader; + uint8_t up_children; + uint8_t n_children; + char *vol_file; + etcd_session etcd; + volatile unsigned int fence_io; + uint32_t current_term; +#ifdef NSR_DEBUG + uint32_t leader_log_fd; +#endif + volatile int recon_notify_inited; + volatile int leader_inited; + uint32_t kid_state; + gf_lock_t dirty_lock; + struct list_head dirty_fds; + gf_boolean_t nsr_recon_start; + void * recon_ctx; + volatile uint32_t ops_in_flight; + uint32_t index; + gf_lock_t index_lock; + double quorum_pct; +} nsr_private_t; + +typedef struct { + call_stub_t *stub; + call_stub_t *qstub; + uint8_t call_count; + fd_t *fd; + struct list_head qlinks; +} nsr_local_t; + +/* + * This should match whatever changelog returns on the pre-op for us to pass + * when we're ready for our post-op. + */ +typedef uint32_t log_id_t; + +typedef struct { + struct list_head links; + log_id_t id; +} nsr_dirty_list_t; + +typedef struct { + fd_t *fd; + struct list_head dirty_list; + struct list_head fd_list; +} nsr_fd_ctx_t; + +typedef struct { + gf_lock_t lock; + uint32_t active; + struct list_head aqueue; + uint32_t pending; + struct list_head pqueue; +} nsr_inode_ctx_t; + +void nsr_recon_notify_event_set_leader(nsr_private_t *priv); +void nsr_recon_notify_event_add_child(nsr_private_t *priv, uint32_t index); +void* nsr_recon_notify_thread (void *this); + diff --git a/xlators/cluster/nsr-server/src/nsr.c b/xlators/cluster/nsr-server/src/nsr.c new file mode 100644 index 000000000..85eba09b5 --- /dev/null +++ b/xlators/cluster/nsr-server/src/nsr.c @@ -0,0 +1,812 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "call-stub.h" +#include "defaults.h" +#include "xlator.h" +#include "api/src/glfs.h" +#include "api/src/glfs-internal.h" +#include "run.h" +#include "common-utils.h" +#include "syncop.h" + +#include "etcd-api.h" +#include "nsr-internal.h" +#include "../../nsr-recon/src/recon_driver.h" +#include "../../nsr-recon/src/recon_xlator.h" + + +#define GLUSTERD_DEFAULT_WORKDIR "/var/lib/glusterd" +#define GLUSTERD_VOLUME_DIR_PREFIX "vols" +#define GLUSTERD_BRICK_INFO_DIR "bricks" + +#define NSR_FLUSH_INTERVAL 5 + +nsr_inode_ctx_t * +nsr_get_inode_ctx (xlator_t *this, inode_t *inode) +{ + uint64_t ctx_int = 0LL; + nsr_inode_ctx_t *ctx_ptr; + + if (__inode_ctx_get(inode,this,&ctx_int) == 0) { + ctx_ptr = (nsr_inode_ctx_t *)(long)ctx_int; + } + else { + ctx_ptr = GF_CALLOC (1, sizeof(*ctx_ptr), + gf_mt_nsr_inode_ctx_t); + if (ctx_ptr) { + ctx_int = (uint64_t)(long)ctx_ptr; + if (__inode_ctx_set(inode,this,&ctx_int) == 0) { + LOCK_INIT(&ctx_ptr->lock); + INIT_LIST_HEAD(&ctx_ptr->aqueue); + INIT_LIST_HEAD(&ctx_ptr->pqueue); + } + else { + GF_FREE(ctx_ptr); + ctx_ptr = NULL; + } + } + + } + + return ctx_ptr; +} + +nsr_fd_ctx_t * +nsr_get_fd_ctx (xlator_t *this, fd_t *fd) +{ + uint64_t ctx_int = 0LL; + nsr_fd_ctx_t *ctx_ptr; + + if (__fd_ctx_get(fd,this,&ctx_int) == 0) { + ctx_ptr = (nsr_fd_ctx_t *)(long)ctx_int; + } + else { + ctx_ptr = GF_CALLOC (1, sizeof(*ctx_ptr), gf_mt_nsr_fd_ctx_t); + if (ctx_ptr) { + if (__fd_ctx_set(fd,this,(uint64_t)ctx_ptr) == 0) { + INIT_LIST_HEAD(&ctx_ptr->dirty_list); + INIT_LIST_HEAD(&ctx_ptr->fd_list); + } + else { + GF_FREE(ctx_ptr); + ctx_ptr = NULL; + } + } + + } + + return ctx_ptr; +} + +void +nsr_mark_fd_dirty (xlator_t *this, nsr_local_t *local) +{ + fd_t *fd = local->fd; + nsr_fd_ctx_t *ctx_ptr; + nsr_dirty_list_t *dirty; + nsr_private_t *priv = this->private; + + /* + * TBD: don't do any of this for O_SYNC/O_DIRECT writes. + * Unfortunately, that optimization requires that we distinguish + * between writev and other "write" calls, saving the original flags + * and checking them in the callback. Too much work for too little + * gain right now. + */ + + LOCK(&fd->lock); + ctx_ptr = nsr_get_fd_ctx(this,fd); + dirty = GF_CALLOC(1,sizeof(*dirty),gf_mt_nsr_dirty_t); + if (ctx_ptr && dirty) { + gf_log (this->name, GF_LOG_TRACE, + "marking fd %p as dirty (%p)", fd, dirty); + /* TBD: fill dirty->id from what changelog gave us */ + list_add_tail(&dirty->links,&ctx_ptr->dirty_list); + if (list_empty(&ctx_ptr->fd_list)) { + /* Add a ref so _release doesn't get called. */ + ctx_ptr->fd = fd_ref(fd); + LOCK(&priv->dirty_lock); + list_add_tail (&ctx_ptr->fd_list, + &priv->dirty_fds); + UNLOCK(&priv->dirty_lock); + } + } + else { + gf_log (this->name, GF_LOG_ERROR, + "could not mark %p dirty", fd); + if (ctx_ptr) { + GF_FREE(ctx_ptr); + } + if (dirty) { + GF_FREE(dirty); + } + } + UNLOCK(&fd->lock); +} + +#define NSR_TERM_XATTR "trusted.nsr.term" +#define NSR_INDEX_XATTR "trusted.nsr.index" +#define RECON_TERM_XATTR "trusted.nsr.recon-term" +#define RECON_INDEX_XATTR "trusted.nsr.recon-index" +#define NSR_REP_COUNT_XATTR "trusted.nsr.rep-count" +#include "nsr-cg.c" + +uint8_t +nsr_count_up_kids (nsr_private_t *priv) +{ + uint8_t retval = 0; + uint8_t i; + + for (i = 0; i < priv->n_children; ++i) { + if (priv->kid_state & (1 << i)) { + ++retval; + } + } + + return retval; +} + +/* + * The fsync machinery looks a lot like that for any write call, but there are + * some important differences that are easy to miss. First, we don't care + * about the xdata that shows whether the call came from a leader or + * reconciliation process. If we're the leader we fan out; if we're not we + * don't. Second, we don't wait for followers before we issue the local call. + * The code generation system could be updated to handle this, and still might + * if we need to implement other "almost identical" paths (e.g. for open), but + * a copy is more readable as long as it's just one. + */ + +int32_t +nsr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + nsr_local_t *local = frame->local; + gf_boolean_t unwind; + + LOCK(&frame->lock); + unwind = !--(local->call_count); + UNLOCK(&frame->lock); + + if (unwind) { + STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + } + return 0; +} + +int32_t +nsr_fsync_local_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + nsr_dirty_list_t *dirty; + nsr_dirty_list_t *dtmp; + nsr_local_t *local = frame->local; + + list_for_each_entry_safe (dirty, dtmp, &local->qlinks, links) { + gf_log (this->name, GF_LOG_TRACE, + "sending post-op on %p (%p)", local->fd, dirty); + GF_FREE(dirty); + } + + return nsr_fsync_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); +} + +int32_t +nsr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + dict_t *xdata) +{ + nsr_private_t *priv = this->private; + nsr_local_t *local; + uint64_t ctx_int = 0LL; + nsr_fd_ctx_t *ctx_ptr; + xlator_list_t *trav; + + local = mem_get0(this->local_pool); + if (!local) { + STACK_UNWIND_STRICT(fsync,frame,-1,ENOMEM,NULL,NULL,xdata); + return 0; + } + INIT_LIST_HEAD(&local->qlinks); + frame->local = local; + + /* Move the dirty list from the fd to the fsync request. */ + LOCK(&fd->lock); + if (__fd_ctx_get(fd,this,&ctx_int) == 0) { + ctx_ptr = (nsr_fd_ctx_t *)(long)ctx_int; + list_splice_init (&ctx_ptr->dirty_list, + &local->qlinks); + } + UNLOCK(&fd->lock); + + /* Issue the local call. */ + local->call_count = priv->leader ? priv->n_children : 1; + STACK_WIND (frame, nsr_fsync_local_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, + fd, flags, xdata); + + /* Issue remote calls if we're the leader. */ + if (priv->leader) { + for (trav = this->children->next; trav; trav = trav->next) { + STACK_WIND (frame, nsr_fsync_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, + fd, flags, xdata); + } + } + + return 0; +} + +int32_t +nsr_getxattr_special (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + dict_t *result; + nsr_private_t *priv = this->private; + + if (!priv->leader) { + STACK_UNWIND_STRICT (getxattr, frame, -1, EREMOTE, NULL, NULL); + return 0; + } + + if (!name || (strcmp(name,NSR_REP_COUNT_XATTR) != 0)) { + STACK_WIND_TAIL (frame, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, + loc, name, xdata); + return 0; + } + + result = dict_new(); + if (!result) { + goto dn_failed; + } + + priv->up_children = nsr_count_up_kids(this->private); + if (dict_set_uint32(result,NSR_REP_COUNT_XATTR,priv->up_children) != 0) { + goto dsu_failed; + } + + STACK_UNWIND_STRICT (getxattr, frame, 0, 0, result, NULL); + dict_destroy(result); + return 0; + +dsu_failed: + dict_destroy(result); +dn_failed: + STACK_UNWIND_STRICT (getxattr, frame, -1, ENOMEM, NULL, NULL); + return 0; +} + +void +nsr_flush_fd (xlator_t *this, nsr_fd_ctx_t *fd_ctx) +{ + nsr_dirty_list_t *dirty; + nsr_dirty_list_t *dtmp; + + list_for_each_entry_safe (dirty, dtmp, &fd_ctx->dirty_list, links) { + gf_log (this->name, GF_LOG_TRACE, + "sending post-op on %p (%p)", fd_ctx->fd, dirty); + GF_FREE(dirty); + } + + INIT_LIST_HEAD(&fd_ctx->dirty_list); +} + +void * +nsr_flush_thread (void *ctx) +{ + xlator_t *this = ctx; + nsr_private_t *priv = this->private; + struct list_head dirty_fds; + nsr_fd_ctx_t *fd_ctx; + nsr_fd_ctx_t *fd_tmp; + int ret; + + for (;;) { + /* + * We have to be very careful to avoid lock inversions here, so + * we can't just hold priv->dirty_lock while we take and + * release locks for each fd. Instead, we only hold dirty_lock + * at the beginning of each iteration, as we (effectively) make + * a copy of the current list head and then clear the original. + * This leads to four scenarios for adding the first entry to + * an fd and potentially putting it on the global list. + * + * (1) While we're asleep. No lock contention, it just gets + * added and will be processed on the next iteration. + * + * (2) After we've made a local copy, but before we've started + * processing that fd. The new entry will be added to the + * fd (under its lock), and we'll process it on the current + * iteration. + * + * (3) While we're processing the fd. They'll block on the fd + * lock, then see that the list is empty and put it on the + * global list. We'll process it here on the next + * iteration. + * + * (4) While we're working, but after we've processed that fd. + * Same as (1) as far as that fd is concerned. + */ + INIT_LIST_HEAD(&dirty_fds); + LOCK(&priv->dirty_lock); + list_splice_init(&priv->dirty_fds,&dirty_fds); + UNLOCK(&priv->dirty_lock); + + list_for_each_entry_safe (fd_ctx, fd_tmp, &dirty_fds, fd_list) { + ret = syncop_fsync(FIRST_CHILD(this),fd_ctx->fd,0); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "failed to fsync %p (%d)", + fd_ctx->fd, -ret); + } + + LOCK(&fd_ctx->fd->lock); + nsr_flush_fd(this,fd_ctx); + list_del_init(&fd_ctx->fd_list); + UNLOCK(&fd_ctx->fd->lock); + fd_unref(fd_ctx->fd); + } + + sleep(NSR_FLUSH_INTERVAL); + } + + return NULL; +} + +int32_t +nsr_forget (xlator_t *this, inode_t *inode) +{ + uint64_t ctx = 0LL; + + if ((inode_ctx_del(inode,this,&ctx) == 0) && ctx) { + GF_FREE((void *)(long)ctx); + } + + return 0; +} + +int32_t +nsr_release (xlator_t *this, fd_t *fd) +{ + uint64_t ctx = 0LL; + + if ((fd_ctx_del(fd,this,&ctx) == 0) && ctx) { + GF_FREE((void *)(long)ctx); + } + + return 0; +} + +struct xlator_cbks cbks = { + .forget = nsr_forget, + .release = nsr_release, +}; + +int +nsr_reconfigure (xlator_t *this, dict_t *options) +{ + nsr_private_t *priv = this->private; + + GF_OPTION_RECONF ("leader", priv->leader, options, bool, err); + gf_log (this->name, GF_LOG_INFO, + "reconfigure called. setting priv->leader to %d\n", priv->leader); + return 0; + +err: + return -1; +} + +int +nsr_get_child_index (xlator_t *this, xlator_t *kid) +{ + xlator_list_t *trav; + int retval = -1; + + for (trav = this->children; trav; trav = trav->next) { + ++retval; + if (trav->xlator == kid) { + return retval; + } + } + + return -1; +} + +/* + * Child notify handling is unreasonably FUBAR. Sometimes we'll get a + * CHILD_DOWN for a protocol/client child before we ever got a CHILD_UP for it. + * Other times we won't. Because it's effectively random (probably racy), we + * can't just maintain a count. We actually have to keep track of the state + * for each child separately, to filter out the bogus CHILD_DOWN events, and + * then generate counts on demand. + */ +int +nsr_notify (xlator_t *this, int event, void *data, ...) +{ + nsr_private_t *priv = this->private; + int index; + + switch (event) { + case GF_EVENT_CHILD_UP: + index = nsr_get_child_index(this,data); + if (index >= 0) { + priv->kid_state |= (1 << index); + priv->up_children = nsr_count_up_kids(priv); + gf_log (this->name, GF_LOG_INFO, + "got CHILD_UP for %s, now %u kids", + ((xlator_t *)data)->name, + priv->up_children); + if (priv->nsr_recon_start == _gf_true) { + nsr_recon_notify_event_add_child(priv, index); + } + } + break; + case GF_EVENT_CHILD_DOWN: + index = nsr_get_child_index(this,data); + if (index >= 0) { + priv->kid_state &= ~(1 << index); + priv->up_children = nsr_count_up_kids(priv); + gf_log (this->name, GF_LOG_INFO, + "got CHILD_DOWN for %s, now %u kids", + ((xlator_t *)data)->name, + priv->up_children); + } + break; + default: + ; + } + + return default_notify(this,event,data); +} + + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + GF_VALIDATE_OR_GOTO ("nsr", this, out); + + ret = xlator_mem_acct_init (this, gf_mt_nsr_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "Memory accounting init" "failed"); + return ret; + } +out: + return ret; +} + + +extern void *nsr_leader_thread (void *); + +void +nsr_deallocate_priv (nsr_private_t *priv) +{ + if (!priv) { + return; + } + + if (priv->leader_key) { + GF_FREE(priv->leader_key); + } + + if (priv->term_key) { + GF_FREE(priv->term_key); + } + + GF_FREE(priv); +} + + +int32_t +nsr_init (xlator_t *this) +{ + xlator_list_t *remote; + xlator_list_t *local; + nsr_private_t *priv = NULL; + xlator_list_t *trav; + pthread_t kid; + uuid_t tmp_uuid; + char *my_name = NULL, *morph_name = NULL, *recon_file = NULL, *recon_pid_file = NULL, *ptr = NULL; + char *volname; + extern xlator_t global_xlator; + glusterfs_ctx_t *oldctx = global_xlator.ctx; + runner_t runner = {0,}; + int32_t ret = -1; + struct stat buf; + char *recon_log = NULL, *recon_log_dir = NULL; + + /* + * Any fop that gets special treatment has to be patched in here, + * because the compiled-in table is produced by the code generator and + * only contains generated functions. Note that we have to go through + * this->fops because of some dynamic-linking strangeness; modifying + * the static table doesn't work. + */ + this->fops->getxattr = nsr_getxattr_special; + this->fops->fsync = nsr_fsync; + + local = this->children; + if (!local) { + gf_log (this->name, GF_LOG_ERROR, "no local subvolume"); + goto err; + } + + remote = local->next; + if (!remote) { + gf_log (this->name, GF_LOG_ERROR, "no remote subvolumes"); + goto err; + } + + this->local_pool = mem_pool_new (nsr_local_t, 128); + if (!this->local_pool) { + gf_log (this->name, GF_LOG_ERROR, + "failed to create nsr_local_t pool"); + goto err; + } + + priv = GF_CALLOC (1, sizeof(*priv), gf_mt_nsr_private_t); + if (!priv) { + gf_log (this->name, GF_LOG_ERROR, "could not allocate priv"); + goto err; + } + + // set this so that unless leader election is done, IO is fenced + priv->fence_io = 1; + + for (trav = this->children; trav; trav = trav->next) { + ++(priv->n_children); + } + + LOCK_INIT(&priv->dirty_lock); + LOCK_INIT(&priv->index_lock); + INIT_LIST_HEAD(&priv->dirty_fds); + + this->private = priv; + + GF_OPTION_INIT ("etcd-servers", priv->etcd_servers, str, err); + if (!priv->etcd_servers) { + gf_log (this->name, GF_LOG_ERROR, "etcd servers not generated. ???"); + goto err; + } + + + GF_OPTION_INIT ("quorum-percent", priv->quorum_pct, percent, err); + + GF_OPTION_INIT ("subvol-uuid", priv->subvol_uuid, str, err); + gf_log (this->name, GF_LOG_INFO, "subvol_uuid = %s", priv->subvol_uuid); + if (gf_asprintf(&priv->leader_key,"%s:leader",priv->subvol_uuid) <= 0) { + gf_log (this->name, GF_LOG_ERROR, + "could not generate leader key"); + goto err; + } + if (gf_asprintf(&priv->term_key,"%s:term",priv->subvol_uuid) <= 0) { + gf_log (this->name, GF_LOG_ERROR, + "could not generate term key"); + goto err; + } + uuid_generate(tmp_uuid); + priv->brick_uuid = strdup(uuid_utoa(tmp_uuid)); + gf_log (this->name, GF_LOG_INFO, "brick_uuid = %s\n", priv->brick_uuid); + + GF_OPTION_INIT ("my-name", my_name, str, err); + if (!my_name) { + gf_log (this->name, GF_LOG_ERROR, "brick name not generated. ???"); + goto err; + } + GF_OPTION_INIT ("vol-name", volname, str, err); + if (!volname) { + gf_log (this->name, GF_LOG_ERROR, "vol name not generated. ???"); + goto err; + } + + morph_name = GF_CALLOC (1, strlen(my_name) + 1, gf_mt_nsr_private_t); + strcpy(morph_name, my_name); + recon_file = GF_CALLOC (1,PATH_MAX + strlen(morph_name) + strlen("con") +1, gf_mt_nsr_private_t); + recon_pid_file = GF_CALLOC (1,PATH_MAX + strlen(morph_name) + strlen("recon") +1, gf_mt_nsr_private_t); + if ((!recon_file) || (!recon_pid_file)) { + gf_log (this->name, GF_LOG_ERROR, "could not allocate reconciliation file name"); + goto err; + } + ptr = strchr (morph_name, '/'); + while (ptr) { + *ptr = '-'; + ptr = strchr (morph_name, '/'); + } + + sprintf(recon_file,"/%s/%s/%s/%s/",GLUSTERD_DEFAULT_WORKDIR, + GLUSTERD_VOLUME_DIR_PREFIX, + volname, + GLUSTERD_BRICK_INFO_DIR); + strcat(recon_file, morph_name); + strcat(recon_file, "-nsr-recon.vol"); + + sprintf(recon_pid_file,"/%s/%s/%s/%s/",GLUSTERD_DEFAULT_WORKDIR, + GLUSTERD_VOLUME_DIR_PREFIX, + volname, + "run"); + strcat(recon_pid_file, morph_name); + strcat(recon_pid_file, "-recon.pid"); + + priv->vol_file = GF_CALLOC (1,PATH_MAX + strlen(morph_name) + strlen("con") +1, gf_mt_nsr_private_t); + if (!priv->vol_file) { + gf_log (this->name, GF_LOG_ERROR, "could not allocate reconciliation file name"); + goto err; + } + sprintf(priv->vol_file,"%s/%s/%s/%s/", + GLUSTERD_DEFAULT_WORKDIR, + GLUSTERD_VOLUME_DIR_PREFIX, + volname, + GLUSTERD_BRICK_INFO_DIR); + strcat(priv->vol_file, "con:"); + strcat(priv->vol_file, morph_name); + + if (pthread_create(&kid,NULL,nsr_flush_thread,this) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "could not start flush thread"); + /* TBD: treat this as a fatal error? */ + } + + // Start the recon process. Then start the leader thread. + /* + * REVIEW + * Logs belong in /var/log not /tmp. + */ + + ret = mkdir (NSR_LOG_DIR, 0777); + if (ret != 0) { + if (errno != EEXIST) { + gf_log (this->name, GF_LOG_ERROR, "Couldn't create" + " nsr log directory (%s)", strerror (errno)); + goto err; + } + } + + recon_log_dir = GF_CALLOC (1, strlen (NSR_LOG_DIR) + strlen(morph_name) + + 2, gf_mt_nsr_private_t); + if (!recon_log_dir) { + gf_log (this->name, GF_LOG_ERROR, "Couldn't allocate recon log " + "dir name"); + goto err; + } + sprintf (recon_log_dir, "%s/%s", NSR_LOG_DIR, morph_name); + ret = mkdir (recon_log_dir, 0777); + + if (ret != 0){ + if (errno != EEXIST) { + gf_log (this->name, GF_LOG_ERROR, + "Couldn't create brick log dir (%s)", + strerror (errno)); + goto err; + } + } + + recon_log = GF_CALLOC (1, strlen (recon_log_dir)+ + strlen ("reconciliation.log") + 2, + gf_mt_nsr_private_t); + if (!recon_log) { + gf_log (this->name, GF_LOG_ERROR, "Couldn't allocate recon log" + " file name"); + goto err; + } + sprintf (recon_log, "%s/reconciliation.log", recon_log_dir); + + if (!stat(priv->vol_file, &buf)) { + + runinit (&runner); + runner_add_args(&runner, SBIN_DIR "/glusterfs", + "-f", recon_file, + "-p", recon_pid_file, + "-l", recon_log, + NULL); + ret = runner_run (&runner); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "could not exec reconciliation process %s", + SBIN_DIR "/glusterfs"); + goto err; + } + + // TBD - convert this to make sure recon process runs + sleep(2); + priv->nsr_recon_start = _gf_true; + } + + + (void)pthread_create(&kid,NULL,nsr_recon_notify_thread,this); + while (priv->recon_notify_inited == 0) { + sleep(1); + } + + if (pthread_create(&kid,NULL,nsr_leader_thread,this) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to start leader thread"); + } + while (priv->leader_inited == 0) { + sleep(1); + } + + + /* + * Calling glfs_new changes old->ctx, even if THIS still points + * to global_xlator. That causes problems later in the main + * thread, when gf_log_dump_graph tries to use the FILE after + * we've mucked with it and gets a segfault in __fprintf_chk. + * We can avoid all that by undoing the damage before we + * continue. + */ + global_xlator.ctx = oldctx; + + return 0; + +err: + nsr_deallocate_priv(priv); + return -1; +} + + +void +nsr_fini (xlator_t *this) +{ + nsr_deallocate_priv(this->private); +} + +class_methods_t class_methods = { + .init = nsr_init, + .fini = nsr_fini, + .reconfigure = nsr_reconfigure, + .notify = nsr_notify, +}; + +struct volume_options options[] = { + { .key = {"leader"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .description = "Start in the leader role. This is only for " + "bootstrapping the code, and should go away when we " + "have real leader election." + }, + { .key ={"vol-name"}, + .type = GF_OPTION_TYPE_STR, + .description = "volume name" + }, + { .key = {"my-name"}, + .type = GF_OPTION_TYPE_STR, + .description = "brick name in form of host:/path" + }, + { .key = {"etcd-servers"}, + .type = GF_OPTION_TYPE_STR, + .description = "list of comma seperated etc servers" + }, + { .key = {"subvol-uuid"}, + .type = GF_OPTION_TYPE_STR, + .description = "UUID for this NSR (sub)volume" + }, + { .key = {"quorum-percent"}, + .type = GF_OPTION_TYPE_PERCENT, + .default_value = "50.0", + .description = "percentage of rep_count-1 that must be up" + }, + { .key = {NULL} }, +}; diff --git a/xlators/cluster/nsr-server/src/recon_notify.c b/xlators/cluster/nsr-server/src/recon_notify.c new file mode 100644 index 000000000..1c50de234 --- /dev/null +++ b/xlators/cluster/nsr-server/src/recon_notify.c @@ -0,0 +1,389 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <string.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "call-stub.h" +#include "defaults.h" +#include "xlator.h" +#include "api/src/glfs.h" +#include "api/src/glfs-internal.h" +#include "etcd-api.h" +#include "nsr-internal.h" +#include "../../nsr-recon/src/recon_driver.h" +#include "../../nsr-recon/src/recon_xlator.h" + + + +typedef struct _nsr_recon_notify_ctx_t { + nsr_recon_notify_ev_t recon_head; + pthread_mutex_t recon_mutex; + pthread_cond_t recon_cv; + char **hosts; // list of hosts ordered depending on child indices + uint32_t current_term; + uint32_t last_reconciled_term; + glfs_t *fs; + glfs_fd_t *fd; +} nsr_recon_notify_ctx_t; + +static int +xlator_get_option (xlator_t *xl, char *key, char **value) +{ + GF_ASSERT (xl); + return dict_get_str (xl->options, key, value); +} + +void nsr_recon_notify_event_set_leader(nsr_private_t *priv) +{ + nsr_recon_notify_ev_t *ev; + nsr_recon_notify_ctx_t *ctx = (nsr_recon_notify_ctx_t *)priv->recon_ctx; + + ev = GF_CALLOC (1, sizeof (nsr_recon_notify_ev_t), 0); + ev->id = NSR_RECON_SET_LEADER; + INIT_LIST_HEAD(&(ev->list)); + pthread_mutex_lock(&ctx->recon_mutex); + list_add_tail(&ev->list, &ctx->recon_head.list); + pthread_cond_signal(&ctx->recon_cv); + pthread_mutex_unlock(&ctx->recon_mutex); +} + +void nsr_recon_notify_event_add_child(nsr_private_t *priv, uint32_t index) +{ + nsr_recon_notify_ev_t *ev; + nsr_recon_notify_ctx_t *ctx = (nsr_recon_notify_ctx_t *)priv->recon_ctx; + + ev = GF_CALLOC (1, sizeof (nsr_recon_notify_ev_t), 0); + ev->id = NSR_RECON_ADD_CHILD; + ev->index = index; + INIT_LIST_HEAD(&(ev->list)); + pthread_mutex_lock(&ctx->recon_mutex); + list_add_tail(&ev->list, &ctx->recon_head.list); + pthread_cond_signal(&ctx->recon_cv); + pthread_mutex_unlock(&ctx->recon_mutex); +} + + +static void +nsr_recon_set_leader (xlator_t *this) +{ + + nsr_private_t *priv = this->private; + nsr_recon_notify_ctx_t *ctx = (nsr_recon_notify_ctx_t *)priv->recon_ctx; + nsr_recon_role_t role; + xlator_t *old = this; + uint32_t i=0; + + if (priv->leader != _gf_true) + return; + + if (ctx->last_reconciled_term == priv->current_term) + return; + + /* + * Quorum for reconciliation is not the same as quorum for I/O. Here, + * we require a true majority. The +1 is because we don't count + * ourselves as part of n_children or up_children. + * + * TBD: re-evaluate when to reconcile (including partial) + */ + if (priv->up_children <= (priv->n_children / 2)) + return; + + gf_log (this->name, GF_LOG_INFO, + "Sending message to do recon with %d nodes\n", + priv->up_children); + + role.num = 0; + role.role = leader; + for (i = 0; i < priv->n_children; ++i) { + if (priv->kid_state & (1 << i)) { + gf_log (this->name, GF_LOG_INFO, + "Recon using host %s", + ctx->hosts[i]); + strcpy(role.info[role.num].name, ctx->hosts[i]); + (role.num)++; + } + } + + gf_log (this->name, GF_LOG_INFO, + "setting current term as %d", priv->current_term); + role.current_term = priv->current_term; + ENDIAN_CONVERSION_RR(role, _gf_false); + + // inform the reconciliator that this is leader + // in the callback (once reconciliation is done), + // we will unfence the IOs. + // TBD - error handling later. + if (glfs_lseek(ctx->fd, nsr_recon_xlator_sector_1, SEEK_SET) == -1) { + gf_log (this->name, GF_LOG_ERROR, + "doing lseek failed\n"); + return; + } + + glusterfs_this_set(old); + gf_log (this->name, GF_LOG_INFO, + "Writing to local node to set leader"); + do { + if (priv->leader != _gf_true) { + glusterfs_this_set(old); + gf_log (this->name, GF_LOG_ERROR, "no longer leader\n"); + return; + } + if (glfs_write(ctx->fd, &role, sizeof(role), 0) == -1) { + if (errno == EAGAIN) { + // Wait for old reconciliation to bail out. + glusterfs_this_set(old); + gf_log (this->name, GF_LOG_ERROR, + "write failed with retry. retrying after some time\n"); + sleep(5); + continue; + } + else{ + glusterfs_this_set(old); + gf_log (this->name, GF_LOG_ERROR, + "doing write failed\n"); + // This is because reconciliation has returned with error + // because some node has died in between. + // What should be done? Either we retry being leader + // or hook to CHILD_DOWN notification. + // Put that logic later. As of now we will just retry. + // This is easier. + sleep(5); + continue; + } + } else { + glusterfs_this_set(old); + gf_log (this->name, GF_LOG_INFO, "doing write with success\n"); + break; + } + } while(1); + glusterfs_this_set(old); + gf_log (this->name, GF_LOG_INFO, + "glfs_write returned. unfencing IO\n"); + + // TBD - error handling + + ctx->last_reconciled_term = priv->current_term; + priv->index = 0; // reset changelog index + atomic_fetch_and(&(priv->fence_io), 0); + + return; +} + +static void +nsr_recon_add_child (xlator_t *this, uint32_t index) +{ + nsr_private_t *priv = this->private; + nsr_recon_notify_ctx_t *ctx = (nsr_recon_notify_ctx_t *)priv->recon_ctx; + nsr_recon_role_t role; + xlator_t *old = this; + + if (priv->leader != _gf_true) + return; + + // reconciliation still pending. + // Check if we have majority + if (ctx->last_reconciled_term != priv->current_term) { + nsr_recon_set_leader(this); + } else { + // Reconciliation done. + // new child joining the majority/ + // Do reconciliation only fot this child but after fencing new IO and draining old IO + role.num = 1; + role.role = joiner; + + atomic_fetch_or(&(priv->fence_io), 1); + while(priv->ops_in_flight) { + sleep(1); + } + + strcpy(role.info[0].name, ctx->hosts[index]); + role.current_term = priv->current_term; + ENDIAN_CONVERSION_RR(role, _gf_false); + glfs_lseek(ctx->fd, nsr_recon_xlator_sector_1, SEEK_SET); + glusterfs_this_set(old); + gf_log (this->name, GF_LOG_INFO, + "Writing to local node to join %s\n", role.info[0].name); + glfs_write(ctx->fd, &role, + sizeof(role), 0); + glusterfs_this_set(old); + gf_log (this->name, GF_LOG_INFO, + "Write to local node to set joiner returned\n"); + + // TBD - error handling + atomic_fetch_and(&(priv->fence_io), 0); + } + + return; +} + +static uint32_t +nsr_setup_recon (xlator_t *this) +{ + nsr_private_t *priv = this->private; + xlator_t *old = this; + uint32_t ret = 0; + nsr_recon_notify_ctx_t *ctx = (nsr_recon_notify_ctx_t *)priv->recon_ctx; + + if (priv->nsr_recon_start == _gf_false) + return 0; + + ctx->fs = glfs_new(priv->subvol_uuid); + if (!ctx->fs) { + ret = 1; + gf_log (this->name, GF_LOG_ERROR, "failed to initialise glfs \n"); + goto done; + } + + glusterfs_this_set(old); + ret = glfs_set_volfile(ctx->fs, priv->vol_file); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "failed to set volfile \n"); + goto done; + } + + glusterfs_this_set(old); + /* + * REVIEW + * Logs belong in /var/log not /tmp. + */ + glfs_set_logging (ctx->fs,"/tmp/glfs-log", 7); + if (glfs_init(ctx->fs) < 0) { + gf_log (this->name, GF_LOG_ERROR, "failed to init volfile \n"); + ret = 1; + goto done; + } + + glusterfs_this_set(old); + ctx->fd = glfs_open (ctx->fs, "/", O_RDWR); + if (ctx->fd == NULL) { + ret = 1; + gf_log (this->name, GF_LOG_ERROR, + "failed to open fd to communicate with recon process \n"); + goto done; + } + + +done: + glusterfs_this_set(old); + return ret; +} + + +static void +nsr_setup_hosts(xlator_t *this) +{ + xlator_list_t *trav; + nsr_private_t *priv = this->private; + uint32_t i = 0; + nsr_recon_notify_ctx_t *ctx = (nsr_recon_notify_ctx_t *)priv->recon_ctx; + + ctx->hosts = GF_CALLOC(sizeof(char *), priv->n_children, gf_mt_nsr_private_t); + // Iterate thru all the children + for (trav = this->children; trav; trav = trav->next) { + char *hostname = NULL, *vol = NULL; + int ret1 = 0, ret2 = 0, ret = 0; + xlator_t *xl = trav->xlator; + // If the child type is that of protocol/client + if (!strcmp(trav->xlator->type, "protocol/client")) { + ret1 = xlator_get_option (xl, "remote-host", &hostname); + ret2 = xlator_get_option (xl, "remote-subvolume", &vol); + if (!ret1 && !ret2) { + // add the name of that host to the hosts + ctx->hosts[i] = GF_CALLOC(sizeof(char), strlen(hostname) + strlen(vol) + 2, 0); + strcpy(ctx->hosts[i], hostname); + strcat(ctx->hosts[i], ":"); + strcat(ctx->hosts[i], vol); + gf_log (this->name, GF_LOG_INFO, + "adding hosts %s to recon notfiy list", ctx->hosts[i]); + } else { + gf_log (this->name, GF_LOG_ERROR, + "CANNOT FIND HOSTNAME FOR A CHILD"); + GF_ASSERT(0); + } + // local brick + } else { + ret = xlator_get_option (this, "my-name", &hostname); + if (!ret) { + uint32_t len = strlen(hostname); + ctx->hosts[i] = GF_CALLOC(sizeof(char), + len+1, + gf_mt_nsr_private_t); + strcpy(ctx->hosts[i], hostname); + gf_log (this->name, GF_LOG_INFO, + "adding my host %s to recon notfiy list", ctx->hosts[i]); + } else { + gf_log (this->name, GF_LOG_ERROR, + "CANNOT FIND MY HOSTNAME"); + GF_ASSERT(0); + } + } + i++; + } +} + +void * +nsr_recon_notify_thread (void *arg) +{ + xlator_t *this = (xlator_t *)arg; + nsr_private_t *priv = this->private; + nsr_recon_notify_ev_t *ev; + nsr_recon_notify_ctx_t *ctx; + + priv->recon_ctx = GF_CALLOC(1, sizeof(nsr_recon_notify_ctx_t), gf_mt_nsr_private_t); + if (!priv->recon_ctx) { + gf_log (this->name, GF_LOG_ERROR, "calloc error"); + return NULL; + } + ctx = priv->recon_ctx; + + pthread_mutex_init(&(ctx->recon_mutex), NULL); + pthread_cond_init(&(ctx->recon_cv), NULL); + INIT_LIST_HEAD(&(ctx->recon_head.list)); + + nsr_setup_hosts(this); + + if (nsr_setup_recon(this)) { + gf_log (this->name, GF_LOG_ERROR, "recon notify thread : initing glfs error"); + return NULL; + } + + priv->recon_notify_inited = 1; + + while(1) { + pthread_mutex_lock(&ctx->recon_mutex); + while (list_empty(&(ctx->recon_head.list))) { + pthread_cond_wait(&ctx->recon_cv, &ctx->recon_mutex); + } + pthread_mutex_unlock(&ctx->recon_mutex); + + list_for_each_entry(ev, &(ctx->recon_head.list), list) { + + if (ev->id == NSR_RECON_SET_LEADER) { + gf_log (this->name, GF_LOG_INFO, + "got add leader notfiy event"); + nsr_recon_set_leader(this); + } else if (ev->id == NSR_RECON_ADD_CHILD) { + gf_log (this->name, GF_LOG_INFO, + "got add child notify event"); + nsr_recon_add_child(this, ev->index); + } + } + list_del_init (&ev->list); + } + + return NULL; +} + diff --git a/xlators/cluster/nsr-server/src/yajl.c b/xlators/cluster/nsr-server/src/yajl.c new file mode 100644 index 000000000..54e6474fc --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl.c @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "yajl/yajl_parse.h" +#include "yajl_lex.h" +#include "yajl_parser.h" +#include "yajl_alloc.h" + +#include <stdlib.h> +#include <string.h> +#include <stdarg.h> +#include <assert.h> + +const char * +yajl_status_to_string(yajl_status stat) +{ + const char * statStr = "unknown"; + switch (stat) { + case yajl_status_ok: + statStr = "ok, no error"; + break; + case yajl_status_client_canceled: + statStr = "client canceled parse"; + break; + case yajl_status_error: + statStr = "parse error"; + break; + } + return statStr; +} + +yajl_handle +yajl_alloc(const yajl_callbacks * callbacks, + yajl_alloc_funcs * afs, + void * ctx) +{ + yajl_handle hand = NULL; + yajl_alloc_funcs afsBuffer; + + /* first order of business is to set up memory allocation routines */ + if (afs != NULL) { + if (afs->malloc == NULL || afs->realloc == NULL || afs->free == NULL) + { + return NULL; + } + } else { + yajl_set_default_alloc_funcs(&afsBuffer); + afs = &afsBuffer; + } + + hand = (yajl_handle) YA_MALLOC(afs, sizeof(struct yajl_handle_t)); + + /* copy in pointers to allocation routines */ + memcpy((void *) &(hand->alloc), (void *) afs, sizeof(yajl_alloc_funcs)); + + hand->callbacks = callbacks; + hand->ctx = ctx; + hand->lexer = NULL; + hand->bytesConsumed = 0; + hand->decodeBuf = yajl_buf_alloc(&(hand->alloc)); + hand->flags = 0; + yajl_bs_init(hand->stateStack, &(hand->alloc)); + yajl_bs_push(hand->stateStack, yajl_state_start); + + return hand; +} + +int +yajl_config(yajl_handle h, yajl_option opt, ...) +{ + int rv = 1; + va_list ap; + va_start(ap, opt); + + switch(opt) { + case yajl_allow_comments: + case yajl_dont_validate_strings: + case yajl_allow_trailing_garbage: + case yajl_allow_multiple_values: + case yajl_allow_partial_values: + if (va_arg(ap, int)) h->flags |= opt; + else h->flags &= ~opt; + break; + default: + rv = 0; + } + va_end(ap); + + return rv; +} + +void +yajl_free(yajl_handle handle) +{ + yajl_bs_free(handle->stateStack); + yajl_buf_free(handle->decodeBuf); + if (handle->lexer) { + yajl_lex_free(handle->lexer); + handle->lexer = NULL; + } + YA_FREE(&(handle->alloc), handle); +} + +yajl_status +yajl_parse(yajl_handle hand, const unsigned char * jsonText, + size_t jsonTextLen) +{ + yajl_status status; + + /* lazy allocation of the lexer */ + if (hand->lexer == NULL) { + hand->lexer = yajl_lex_alloc(&(hand->alloc), + hand->flags & yajl_allow_comments, + !(hand->flags & yajl_dont_validate_strings)); + } + + status = yajl_do_parse(hand, jsonText, jsonTextLen); + return status; +} + + +yajl_status +yajl_complete_parse(yajl_handle hand) +{ + /* The lexer is lazy allocated in the first call to parse. if parse is + * never called, then no data was provided to parse at all. This is a + * "premature EOF" error unless yajl_allow_partial_values is specified. + * allocating the lexer now is the simplest possible way to handle this + * case while preserving all the other semantics of the parser + * (multiple values, partial values, etc). */ + if (hand->lexer == NULL) { + hand->lexer = yajl_lex_alloc(&(hand->alloc), + hand->flags & yajl_allow_comments, + !(hand->flags & yajl_dont_validate_strings)); + } + + return yajl_do_finish(hand); +} + +unsigned char * +yajl_get_error(yajl_handle hand, int verbose, + const unsigned char * jsonText, size_t jsonTextLen) +{ + return yajl_render_error_string(hand, jsonText, jsonTextLen, verbose); +} + +size_t +yajl_get_bytes_consumed(yajl_handle hand) +{ + if (!hand) return 0; + else return hand->bytesConsumed; +} + + +void +yajl_free_error(yajl_handle hand, unsigned char * str) +{ + /* use memory allocation functions if set */ + YA_FREE(&(hand->alloc), str); +} + +/* XXX: add utility routines to parse from file */ diff --git a/xlators/cluster/nsr-server/src/yajl/yajl_common.h b/xlators/cluster/nsr-server/src/yajl/yajl_common.h new file mode 100644 index 000000000..49ca3a5cb --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl/yajl_common.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef __YAJL_COMMON_H__ +#define __YAJL_COMMON_H__ + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define YAJL_MAX_DEPTH 128 + +/* msft dll export gunk. To build a DLL on windows, you + * must define WIN32, YAJL_SHARED, and YAJL_BUILD. To use a shared + * DLL, you must define YAJL_SHARED and WIN32 */ +#if defined(WIN32) && defined(YAJL_SHARED) +# ifdef YAJL_BUILD +# define YAJL_API __declspec(dllexport) +# else +# define YAJL_API __declspec(dllimport) +# endif +#else +# if defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 303 +# define YAJL_API __attribute__ ((visibility("default"))) +# else +# define YAJL_API +# endif +#endif + +/** pointer to a malloc function, supporting client overriding memory + * allocation routines */ +typedef void * (*yajl_malloc_func)(void *ctx, size_t sz); + +/** pointer to a free function, supporting client overriding memory + * allocation routines */ +typedef void (*yajl_free_func)(void *ctx, void * ptr); + +/** pointer to a realloc function which can resize an allocation. */ +typedef void * (*yajl_realloc_func)(void *ctx, void * ptr, size_t sz); + +/** A structure which can be passed to yajl_*_alloc routines to allow the + * client to specify memory allocation functions to be used. */ +typedef struct +{ + /** pointer to a function that can allocate uninitialized memory */ + yajl_malloc_func malloc; + /** pointer to a function that can resize memory allocations */ + yajl_realloc_func realloc; + /** pointer to a function that can free memory allocated using + * reallocFunction or mallocFunction */ + yajl_free_func free; + /** a context pointer that will be passed to above allocation routines */ + void * ctx; +} yajl_alloc_funcs; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/xlators/cluster/nsr-server/src/yajl/yajl_gen.h b/xlators/cluster/nsr-server/src/yajl/yajl_gen.h new file mode 100644 index 000000000..52fa99fc2 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl/yajl_gen.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/** + * \file yajl_gen.h + * Interface to YAJL's JSON generation facilities. + */ + +#include <yajl/yajl_common.h> + +#ifndef __YAJL_GEN_H__ +#define __YAJL_GEN_H__ + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" { +#endif + /** generator status codes */ + typedef enum { + /** no error */ + yajl_gen_status_ok = 0, + /** at a point where a map key is generated, a function other than + * yajl_gen_string was called */ + yajl_gen_keys_must_be_strings, + /** YAJL's maximum generation depth was exceeded. see + * YAJL_MAX_DEPTH */ + yajl_max_depth_exceeded, + /** A generator function (yajl_gen_XXX) was called while in an error + * state */ + yajl_gen_in_error_state, + /** A complete JSON document has been generated */ + yajl_gen_generation_complete, + /** yajl_gen_double was passed an invalid floating point value + * (infinity or NaN). */ + yajl_gen_invalid_number, + /** A print callback was passed in, so there is no internal + * buffer to get from */ + yajl_gen_no_buf, + /** returned from yajl_gen_string() when the yajl_gen_validate_utf8 + * option is enabled and an invalid was passed by client code. + */ + yajl_gen_invalid_string + } yajl_gen_status; + + /** an opaque handle to a generator */ + typedef struct yajl_gen_t * yajl_gen; + + /** a callback used for "printing" the results. */ + typedef void (*yajl_print_t)(void * ctx, + const char * str, + size_t len); + + /** configuration parameters for the parser, these may be passed to + * yajl_gen_config() along with option specific argument(s). In general, + * all configuration parameters default to *off*. */ + typedef enum { + /** generate indented (beautiful) output */ + yajl_gen_beautify = 0x01, + /** + * Set an indent string which is used when yajl_gen_beautify + * is enabled. Maybe something like \\t or some number of + * spaces. The default is four spaces ' '. + */ + yajl_gen_indent_string = 0x02, + /** + * Set a function and context argument that should be used to + * output generated json. the function should conform to the + * yajl_print_t prototype while the context argument is a + * void * of your choosing. + * + * example: + * yajl_gen_config(g, yajl_gen_print_callback, myFunc, myVoidPtr); + */ + yajl_gen_print_callback = 0x04, + /** + * Normally the generator does not validate that strings you + * pass to it via yajl_gen_string() are valid UTF8. Enabling + * this option will cause it to do so. + */ + yajl_gen_validate_utf8 = 0x08, + /** + * the forward solidus (slash or '/' in human) is not required to be + * escaped in json text. By default, YAJL will not escape it in the + * iterest of saving bytes. Setting this flag will cause YAJL to + * always escape '/' in generated JSON strings. + */ + yajl_gen_escape_solidus = 0x10 + } yajl_gen_option; + + /** allow the modification of generator options subsequent to handle + * allocation (via yajl_alloc) + * \returns zero in case of errors, non-zero otherwise + */ + YAJL_API int yajl_gen_config(yajl_gen g, yajl_gen_option opt, ...); + + /** allocate a generator handle + * \param allocFuncs an optional pointer to a structure which allows + * the client to overide the memory allocation + * used by yajl. May be NULL, in which case + * malloc/free/realloc will be used. + * + * \returns an allocated handle on success, NULL on failure (bad params) + */ + YAJL_API yajl_gen yajl_gen_alloc(const yajl_alloc_funcs * allocFuncs); + + /** free a generator handle */ + YAJL_API void yajl_gen_free(yajl_gen handle); + + YAJL_API yajl_gen_status yajl_gen_integer(yajl_gen hand, long long int number); + /** generate a floating point number. number may not be infinity or + * NaN, as these have no representation in JSON. In these cases the + * generator will return 'yajl_gen_invalid_number' */ + YAJL_API yajl_gen_status yajl_gen_double(yajl_gen hand, double number); + YAJL_API yajl_gen_status yajl_gen_number(yajl_gen hand, + const char * num, + size_t len); + YAJL_API yajl_gen_status yajl_gen_string(yajl_gen hand, + const unsigned char * str, + size_t len); + YAJL_API yajl_gen_status yajl_gen_null(yajl_gen hand); + YAJL_API yajl_gen_status yajl_gen_bool(yajl_gen hand, int boolean); + YAJL_API yajl_gen_status yajl_gen_map_open(yajl_gen hand); + YAJL_API yajl_gen_status yajl_gen_map_close(yajl_gen hand); + YAJL_API yajl_gen_status yajl_gen_array_open(yajl_gen hand); + YAJL_API yajl_gen_status yajl_gen_array_close(yajl_gen hand); + + /** access the null terminated generator buffer. If incrementally + * outputing JSON, one should call yajl_gen_clear to clear the + * buffer. This allows stream generation. */ + YAJL_API yajl_gen_status yajl_gen_get_buf(yajl_gen hand, + const unsigned char ** buf, + size_t * len); + + /** clear yajl's output buffer, but maintain all internal generation + * state. This function will not "reset" the generator state, and is + * intended to enable incremental JSON outputing. */ + YAJL_API void yajl_gen_clear(yajl_gen hand); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/xlators/cluster/nsr-server/src/yajl/yajl_parse.h b/xlators/cluster/nsr-server/src/yajl/yajl_parse.h new file mode 100644 index 000000000..55c831101 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl/yajl_parse.h @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/** + * \file yajl_parse.h + * Interface to YAJL's JSON stream parsing facilities. + */ + +#include <yajl/yajl_common.h> + +#ifndef __YAJL_PARSE_H__ +#define __YAJL_PARSE_H__ + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" { +#endif + /** error codes returned from this interface */ + typedef enum { + /** no error was encountered */ + yajl_status_ok, + /** a client callback returned zero, stopping the parse */ + yajl_status_client_canceled, + /** An error occured during the parse. Call yajl_get_error for + * more information about the encountered error */ + yajl_status_error + } yajl_status; + + /** attain a human readable, english, string for an error */ + YAJL_API const char * yajl_status_to_string(yajl_status code); + + /** an opaque handle to a parser */ + typedef struct yajl_handle_t * yajl_handle; + + /** yajl is an event driven parser. this means as json elements are + * parsed, you are called back to do something with the data. The + * functions in this table indicate the various events for which + * you will be called back. Each callback accepts a "context" + * pointer, this is a void * that is passed into the yajl_parse + * function which the client code may use to pass around context. + * + * All callbacks return an integer. If non-zero, the parse will + * continue. If zero, the parse will be canceled and + * yajl_status_client_canceled will be returned from the parse. + * + * \attention { + * A note about the handling of numbers: + * + * yajl will only convert numbers that can be represented in a + * double or a 64 bit (long long) int. All other numbers will + * be passed to the client in string form using the yajl_number + * callback. Furthermore, if yajl_number is not NULL, it will + * always be used to return numbers, that is yajl_integer and + * yajl_double will be ignored. If yajl_number is NULL but one + * of yajl_integer or yajl_double are defined, parsing of a + * number larger than is representable in a double or 64 bit + * integer will result in a parse error. + * } + */ + typedef struct { + int (* yajl_null)(void * ctx); + int (* yajl_boolean)(void * ctx, int boolVal); + int (* yajl_integer)(void * ctx, long long integerVal); + int (* yajl_double)(void * ctx, double doubleVal); + /** A callback which passes the string representation of the number + * back to the client. Will be used for all numbers when present */ + int (* yajl_number)(void * ctx, const char * numberVal, + size_t numberLen); + + /** strings are returned as pointers into the JSON text when, + * possible, as a result, they are _not_ null padded */ + int (* yajl_string)(void * ctx, const unsigned char * stringVal, + size_t stringLen); + + int (* yajl_start_map)(void * ctx); + int (* yajl_map_key)(void * ctx, const unsigned char * key, + size_t stringLen); + int (* yajl_end_map)(void * ctx); + + int (* yajl_start_array)(void * ctx); + int (* yajl_end_array)(void * ctx); + } yajl_callbacks; + + /** allocate a parser handle + * \param callbacks a yajl callbacks structure specifying the + * functions to call when different JSON entities + * are encountered in the input text. May be NULL, + * which is only useful for validation. + * \param afs memory allocation functions, may be NULL for to use + * C runtime library routines (malloc and friends) + * \param ctx a context pointer that will be passed to callbacks. + */ + YAJL_API yajl_handle yajl_alloc(const yajl_callbacks * callbacks, + yajl_alloc_funcs * afs, + void * ctx); + + + /** configuration parameters for the parser, these may be passed to + * yajl_config() along with option specific argument(s). In general, + * all configuration parameters default to *off*. */ + typedef enum { + /** Ignore javascript style comments present in + * JSON input. Non-standard, but rather fun + * arguments: toggled off with integer zero, on otherwise. + * + * example: + * yajl_config(h, yajl_allow_comments, 1); // turn comment support on + */ + yajl_allow_comments = 0x01, + /** + * When set the parser will verify that all strings in JSON input are + * valid UTF8 and will emit a parse error if this is not so. When set, + * this option makes parsing slightly more expensive (~7% depending + * on processor and compiler in use) + * + * example: + * yajl_config(h, yajl_dont_validate_strings, 1); // disable utf8 checking + */ + yajl_dont_validate_strings = 0x02, + /** + * By default, upon calls to yajl_complete_parse(), yajl will + * ensure the entire input text was consumed and will raise an error + * otherwise. Enabling this flag will cause yajl to disable this + * check. This can be useful when parsing json out of a that contains more + * than a single JSON document. + */ + yajl_allow_trailing_garbage = 0x04, + /** + * Allow multiple values to be parsed by a single handle. The + * entire text must be valid JSON, and values can be seperated + * by any kind of whitespace. This flag will change the + * behavior of the parser, and cause it continue parsing after + * a value is parsed, rather than transitioning into a + * complete state. This option can be useful when parsing multiple + * values from an input stream. + */ + yajl_allow_multiple_values = 0x08, + /** + * When yajl_complete_parse() is called the parser will + * check that the top level value was completely consumed. I.E., + * if called whilst in the middle of parsing a value + * yajl will enter an error state (premature EOF). Setting this + * flag suppresses that check and the corresponding error. + */ + yajl_allow_partial_values = 0x10 + } yajl_option; + + /** allow the modification of parser options subsequent to handle + * allocation (via yajl_alloc) + * \returns zero in case of errors, non-zero otherwise + */ + YAJL_API int yajl_config(yajl_handle h, yajl_option opt, ...); + + /** free a parser handle */ + YAJL_API void yajl_free(yajl_handle handle); + + /** Parse some json! + * \param hand - a handle to the json parser allocated with yajl_alloc + * \param jsonText - a pointer to the UTF8 json text to be parsed + * \param jsonTextLength - the length, in bytes, of input text + */ + YAJL_API yajl_status yajl_parse(yajl_handle hand, + const unsigned char * jsonText, + size_t jsonTextLength); + + /** Parse any remaining buffered json. + * Since yajl is a stream-based parser, without an explicit end of + * input, yajl sometimes can't decide if content at the end of the + * stream is valid or not. For example, if "1" has been fed in, + * yajl can't know whether another digit is next or some character + * that would terminate the integer token. + * + * \param hand - a handle to the json parser allocated with yajl_alloc + */ + YAJL_API yajl_status yajl_complete_parse(yajl_handle hand); + + /** get an error string describing the state of the + * parse. + * + * If verbose is non-zero, the message will include the JSON + * text where the error occured, along with an arrow pointing to + * the specific char. + * + * \returns A dynamically allocated string will be returned which should + * be freed with yajl_free_error + */ + YAJL_API unsigned char * yajl_get_error(yajl_handle hand, int verbose, + const unsigned char * jsonText, + size_t jsonTextLength); + + /** + * get the amount of data consumed from the last chunk passed to YAJL. + * + * In the case of a successful parse this can help you understand if + * the entire buffer was consumed (which will allow you to handle + * "junk at end of input"). + * + * In the event an error is encountered during parsing, this function + * affords the client a way to get the offset into the most recent + * chunk where the error occured. 0 will be returned if no error + * was encountered. + */ + YAJL_API size_t yajl_get_bytes_consumed(yajl_handle hand); + + /** free an error returned from yajl_get_error */ + YAJL_API void yajl_free_error(yajl_handle hand, unsigned char * str); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/xlators/cluster/nsr-server/src/yajl/yajl_tree.h b/xlators/cluster/nsr-server/src/yajl/yajl_tree.h new file mode 100644 index 000000000..8b377f636 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl/yajl_tree.h @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2010-2011 Florian Forster <ff at octo.it> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/** + * \file yajl_tree.h + * + * Parses JSON data and returns the data in tree form. + * + * \author Florian Forster + * \date August 2010 + * + * This interface makes quick parsing and extraction of + * smallish JSON docs trivial: + * + * \include example/parse_config.c + */ + +#ifndef YAJL_TREE_H +#define YAJL_TREE_H 1 + +#include <yajl/yajl_common.h> + +/** possible data types that a yajl_val_s can hold */ +typedef enum { + yajl_t_string = 1, + yajl_t_number = 2, + yajl_t_object = 3, + yajl_t_array = 4, + yajl_t_true = 5, + yajl_t_false = 6, + yajl_t_null = 7, + /** The any type isn't valid for yajl_val_s.type, but can be + * used as an argument to routines like yajl_tree_get(). + */ + yajl_t_any = 8 +} yajl_type; + +#define YAJL_NUMBER_INT_VALID 0x01 +#define YAJL_NUMBER_DOUBLE_VALID 0x02 + +/** A pointer to a node in the parse tree */ +typedef struct yajl_val_s * yajl_val; + +/** + * A JSON value representation capable of holding one of the seven + * types above. For "string", "number", "object", and "array" + * additional data is available in the union. The "YAJL_IS_*" + * and "YAJL_GET_*" macros below allow type checking and convenient + * value extraction. + */ +struct yajl_val_s +{ + /** Type of the value contained. Use the "YAJL_IS_*" macors to check for a + * specific type. */ + yajl_type type; + /** Type-specific data. You may use the "YAJL_GET_*" macros to access these + * members. */ + union + { + char * string; + struct { + long long i; /*< integer value, if representable. */ + double d; /*< double value, if representable. */ + /** Signals whether the \em i and \em d members are + * valid. See \c YAJL_NUMBER_INT_VALID and + * \c YAJL_NUMBER_DOUBLE_VALID. */ + char *r; /*< unparsed number in string form. */ + unsigned int flags; + } number; + struct { + const char **keys; /*< Array of keys */ + yajl_val *values; /*< Array of values. */ + size_t len; /*< Number of key-value-pairs. */ + } object; + struct { + yajl_val *values; /*< Array of elements. */ + size_t len; /*< Number of elements. */ + } array; + } u; +}; + +/** + * Parse a string. + * + * Parses an null-terminated string containing JSON data and returns a pointer + * to the top-level value (root of the parse tree). + * + * \param input Pointer to a null-terminated utf8 string containing + * JSON data. + * \param error_buffer Pointer to a buffer in which an error message will + * be stored if \em yajl_tree_parse fails, or + * \c NULL. The buffer will be initialized before + * parsing, so its content will be destroyed even if + * \em yajl_tree_parse succeeds. + * \param error_buffer_size Size of the memory area pointed to by + * \em error_buffer_size. If \em error_buffer_size is + * \c NULL, this argument is ignored. + * + * \returns Pointer to the top-level value or \c NULL on error. The memory + * pointed to must be freed using \em yajl_tree_free. In case of an error, a + * null terminated message describing the error in more detail is stored in + * \em error_buffer if it is not \c NULL. + */ +YAJL_API yajl_val yajl_tree_parse (const char *input, + char *error_buffer, size_t error_buffer_size); + +/** + * Free a parse tree returned by "yajl_tree_parse". + * + * \param v Pointer to a JSON value returned by "yajl_tree_parse". Passing NULL + * is valid and results in a no-op. + */ +YAJL_API void yajl_tree_free (yajl_val v); + +/** + * Access a nested value inside a tree. + * + * \param parent the node under which you'd like to extract values. + * \param path A null terminated array of strings, each the name of an object key + * \param type the yajl_type of the object you seek, or yajl_t_any if any will do. + * + * \returns a pointer to the found value, or NULL if we came up empty. + * + * Future Ideas: it'd be nice to move path to a string and implement support for + * a teeny tiny micro language here, so you can extract array elements, do things + * like .first and .last, even .length. Inspiration from JSONPath and css selectors? + * No it wouldn't be fast, but that's not what this API is about. + */ +YAJL_API yajl_val yajl_tree_get(yajl_val parent, const char ** path, yajl_type type); + +/* Various convenience macros to check the type of a `yajl_val` */ +#define YAJL_IS_STRING(v) (((v) != NULL) && ((v)->type == yajl_t_string)) +#define YAJL_IS_NUMBER(v) (((v) != NULL) && ((v)->type == yajl_t_number)) +#define YAJL_IS_INTEGER(v) (YAJL_IS_NUMBER(v) && ((v)->u.flags & YAJL_NUMBER_INT_VALID)) +#define YAJL_IS_DOUBLE(v) (YAJL_IS_NUMBER(v) && ((v)->u.flags & YAJL_NUMBER_DOUBLE_VALID)) +#define YAJL_IS_OBJECT(v) (((v) != NULL) && ((v)->type == yajl_t_object)) +#define YAJL_IS_ARRAY(v) (((v) != NULL) && ((v)->type == yajl_t_array )) +#define YAJL_IS_TRUE(v) (((v) != NULL) && ((v)->type == yajl_t_true )) +#define YAJL_IS_FALSE(v) (((v) != NULL) && ((v)->type == yajl_t_false )) +#define YAJL_IS_NULL(v) (((v) != NULL) && ((v)->type == yajl_t_null )) + +/** Given a yajl_val_string return a ptr to the bare string it contains, + * or NULL if the value is not a string. */ +#define YAJL_GET_STRING(v) (YAJL_IS_STRING(v) ? (v)->u.string : NULL) + +/** Get the string representation of a number. You should check type first, + * perhaps using YAJL_IS_NUMBER */ +#define YAJL_GET_NUMBER(v) ((v)->u.number.r) + +/** Get the double representation of a number. You should check type first, + * perhaps using YAJL_IS_DOUBLE */ +#define YAJL_GET_DOUBLE(v) ((v)->u.number.d) + +/** Get the 64bit (long long) integer representation of a number. You should + * check type first, perhaps using YAJL_IS_INTEGER */ +#define YAJL_GET_INTEGER(v) ((v)->u.number.i) + +/** Get a pointer to a yajl_val_object or NULL if the value is not an object. */ +#define YAJL_GET_OBJECT(v) (YAJL_IS_OBJECT(v) ? &(v)->u.object : NULL) + +/** Get a pointer to a yajl_val_array or NULL if the value is not an object. */ +#define YAJL_GET_ARRAY(v) (YAJL_IS_ARRAY(v) ? &(v)->u.array : NULL) + +#endif /* YAJL_TREE_H */ diff --git a/xlators/cluster/nsr-server/src/yajl/yajl_version.h b/xlators/cluster/nsr-server/src/yajl/yajl_version.h new file mode 100644 index 000000000..0fba9b8fc --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl/yajl_version.h @@ -0,0 +1,23 @@ +#ifndef YAJL_VERSION_H_ +#define YAJL_VERSION_H_ + +#include <yajl/yajl_common.h> + +#define YAJL_MAJOR 2 +#define YAJL_MINOR 0 +#define YAJL_MICRO 1 + +#define YAJL_VERSION ((YAJL_MAJOR * 10000) + (YAJL_MINOR * 100) + YAJL_MICRO) + +#ifdef __cplusplus +extern "C" { +#endif + +extern int YAJL_API yajl_version(void); + +#ifdef __cplusplus +} +#endif + +#endif /* YAJL_VERSION_H_ */ + diff --git a/xlators/cluster/nsr-server/src/yajl_alloc.c b/xlators/cluster/nsr-server/src/yajl_alloc.c new file mode 100644 index 000000000..276315af7 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_alloc.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/** + * \file yajl_alloc.h + * default memory allocation routines for yajl which use malloc/realloc and + * free + */ + +#include "yajl_alloc.h" +#include <stdlib.h> + +static void * yajl_internal_malloc(void *ctx, size_t sz) +{ + return malloc(sz); +} + +static void * yajl_internal_realloc(void *ctx, void * previous, + size_t sz) +{ + return realloc(previous, sz); +} + +static void yajl_internal_free(void *ctx, void * ptr) +{ + free(ptr); +} + +void yajl_set_default_alloc_funcs(yajl_alloc_funcs * yaf) +{ + yaf->malloc = yajl_internal_malloc; + yaf->free = yajl_internal_free; + yaf->realloc = yajl_internal_realloc; + yaf->ctx = NULL; +} + diff --git a/xlators/cluster/nsr-server/src/yajl_alloc.h b/xlators/cluster/nsr-server/src/yajl_alloc.h new file mode 100644 index 000000000..a8a9e45e6 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_alloc.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/** + * \file yajl_alloc.h + * default memory allocation routines for yajl which use malloc/realloc and + * free + */ + +#ifndef __YAJL_ALLOC_H__ +#define __YAJL_ALLOC_H__ + +#include "yajl/yajl_common.h" + +#define YA_MALLOC(afs, sz) (afs)->malloc((afs)->ctx, (sz)) +#define YA_FREE(afs, ptr) (afs)->free((afs)->ctx, (ptr)) +#define YA_REALLOC(afs, ptr, sz) (afs)->realloc((afs)->ctx, (ptr), (sz)) + +void yajl_set_default_alloc_funcs(yajl_alloc_funcs * yaf); + +#endif diff --git a/xlators/cluster/nsr-server/src/yajl_buf.c b/xlators/cluster/nsr-server/src/yajl_buf.c new file mode 100644 index 000000000..0d249d364 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_buf.c @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "yajl_buf.h" + +#include <assert.h> +#include <stdlib.h> +#include <string.h> + +#define YAJL_BUF_INIT_SIZE 2048 + +struct yajl_buf_t { + size_t len; + size_t used; + unsigned char * data; + yajl_alloc_funcs * alloc; +}; + +static +void yajl_buf_ensure_available(yajl_buf buf, size_t want) +{ + size_t need; + + assert(buf != NULL); + + /* first call */ + if (buf->data == NULL) { + buf->len = YAJL_BUF_INIT_SIZE; + buf->data = (unsigned char *) YA_MALLOC(buf->alloc, buf->len); + buf->data[0] = 0; + } + + need = buf->len; + + while (want >= (need - buf->used)) need <<= 1; + + if (need != buf->len) { + buf->data = (unsigned char *) YA_REALLOC(buf->alloc, buf->data, need); + buf->len = need; + } +} + +yajl_buf yajl_buf_alloc(yajl_alloc_funcs * alloc) +{ + yajl_buf b = YA_MALLOC(alloc, sizeof(struct yajl_buf_t)); + memset((void *) b, 0, sizeof(struct yajl_buf_t)); + b->alloc = alloc; + return b; +} + +void yajl_buf_free(yajl_buf buf) +{ + assert(buf != NULL); + if (buf->data) YA_FREE(buf->alloc, buf->data); + YA_FREE(buf->alloc, buf); +} + +void yajl_buf_append(yajl_buf buf, const void * data, size_t len) +{ + yajl_buf_ensure_available(buf, len); + if (len > 0) { + assert(data != NULL); + memcpy(buf->data + buf->used, data, len); + buf->used += len; + buf->data[buf->used] = 0; + } +} + +void yajl_buf_clear(yajl_buf buf) +{ + buf->used = 0; + if (buf->data) buf->data[buf->used] = 0; +} + +const unsigned char * yajl_buf_data(yajl_buf buf) +{ + return buf->data; +} + +size_t yajl_buf_len(yajl_buf buf) +{ + return buf->used; +} + +void +yajl_buf_truncate(yajl_buf buf, size_t len) +{ + assert(len <= buf->used); + buf->used = len; +} diff --git a/xlators/cluster/nsr-server/src/yajl_buf.h b/xlators/cluster/nsr-server/src/yajl_buf.h new file mode 100644 index 000000000..94929a519 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_buf.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef __YAJL_BUF_H__ +#define __YAJL_BUF_H__ + +#include "yajl/yajl_common.h" +#include "yajl_alloc.h" + +/* + * Implementation/performance notes. If this were moved to a header + * only implementation using #define's where possible we might be + * able to sqeeze a little performance out of the guy by killing function + * call overhead. YMMV. + */ + +/** + * yajl_buf is a buffer with exponential growth. the buffer ensures that + * you are always null padded. + */ +typedef struct yajl_buf_t * yajl_buf; + +/* allocate a new buffer */ +yajl_buf yajl_buf_alloc(yajl_alloc_funcs * alloc); + +/* free the buffer */ +void yajl_buf_free(yajl_buf buf); + +/* append a number of bytes to the buffer */ +void yajl_buf_append(yajl_buf buf, const void * data, size_t len); + +/* empty the buffer */ +void yajl_buf_clear(yajl_buf buf); + +/* get a pointer to the beginning of the buffer */ +const unsigned char * yajl_buf_data(yajl_buf buf); + +/* get the length of the buffer */ +size_t yajl_buf_len(yajl_buf buf); + +/* truncate the buffer */ +void yajl_buf_truncate(yajl_buf buf, size_t len); + +#endif diff --git a/xlators/cluster/nsr-server/src/yajl_bytestack.h b/xlators/cluster/nsr-server/src/yajl_bytestack.h new file mode 100644 index 000000000..1fc50c470 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_bytestack.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * A header only implementation of a simple stack of bytes, used in YAJL + * to maintain parse state. + */ + +#ifndef __YAJL_BYTESTACK_H__ +#define __YAJL_BYTESTACK_H__ + +#include "yajl/yajl_common.h" + +#define YAJL_BS_INC 128 + +typedef struct yajl_bytestack_t +{ + unsigned char * stack; + size_t size; + size_t used; + yajl_alloc_funcs * yaf; +} yajl_bytestack; + +/* initialize a bytestack */ +#define yajl_bs_init(obs, _yaf) { \ + (obs).stack = NULL; \ + (obs).size = 0; \ + (obs).used = 0; \ + (obs).yaf = (_yaf); \ + } \ + + +/* initialize a bytestack */ +#define yajl_bs_free(obs) \ + if ((obs).stack) (obs).yaf->free((obs).yaf->ctx, (obs).stack); + +#define yajl_bs_current(obs) \ + (assert((obs).used > 0), (obs).stack[(obs).used - 1]) + +#define yajl_bs_push(obs, byte) { \ + if (((obs).size - (obs).used) == 0) { \ + (obs).size += YAJL_BS_INC; \ + (obs).stack = (obs).yaf->realloc((obs).yaf->ctx,\ + (void *) (obs).stack, (obs).size);\ + } \ + (obs).stack[((obs).used)++] = (byte); \ +} + +/* removes the top item of the stack, returns nothing */ +#define yajl_bs_pop(obs) { ((obs).used)--; } + +#define yajl_bs_set(obs, byte) \ + (obs).stack[((obs).used) - 1] = (byte); + + +#endif diff --git a/xlators/cluster/nsr-server/src/yajl_encode.c b/xlators/cluster/nsr-server/src/yajl_encode.c new file mode 100644 index 000000000..9dc9a3e81 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_encode.c @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "yajl_encode.h" + +#include <assert.h> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> + +static void CharToHex(unsigned char c, char * hexBuf) +{ + const char * hexchar = "0123456789ABCDEF"; + hexBuf[0] = hexchar[c >> 4]; + hexBuf[1] = hexchar[c & 0x0F]; +} + +void +yajl_string_encode(const yajl_print_t print, + void * ctx, + const unsigned char * str, + size_t len, + int escape_solidus) +{ + size_t beg = 0; + size_t end = 0; + char hexBuf[7]; + hexBuf[0] = '\\'; hexBuf[1] = 'u'; hexBuf[2] = '0'; hexBuf[3] = '0'; + hexBuf[6] = 0; + + while (end < len) { + const char * escaped = NULL; + switch (str[end]) { + case '\r': escaped = "\\r"; break; + case '\n': escaped = "\\n"; break; + case '\\': escaped = "\\\\"; break; + /* it is not required to escape a solidus in JSON: + * read sec. 2.5: http://www.ietf.org/rfc/rfc4627.txt + * specifically, this production from the grammar: + * unescaped = %x20-21 / %x23-5B / %x5D-10FFFF + */ + case '/': if (escape_solidus) escaped = "\\/"; break; + case '"': escaped = "\\\""; break; + case '\f': escaped = "\\f"; break; + case '\b': escaped = "\\b"; break; + case '\t': escaped = "\\t"; break; + default: + if ((unsigned char) str[end] < 32) { + CharToHex(str[end], hexBuf + 4); + escaped = hexBuf; + } + break; + } + if (escaped != NULL) { + print(ctx, (const char *) (str + beg), end - beg); + print(ctx, escaped, (unsigned int)strlen(escaped)); + beg = ++end; + } else { + ++end; + } + } + print(ctx, (const char *) (str + beg), end - beg); +} + +static void hexToDigit(unsigned int * val, const unsigned char * hex) +{ + unsigned int i; + for (i=0;i<4;i++) { + unsigned char c = hex[i]; + if (c >= 'A') c = (c & ~0x20) - 7; + c -= '0'; + assert(!(c & 0xF0)); + *val = (*val << 4) | c; + } +} + +static void Utf32toUtf8(unsigned int codepoint, char * utf8Buf) +{ + if (codepoint < 0x80) { + utf8Buf[0] = (char) codepoint; + utf8Buf[1] = 0; + } else if (codepoint < 0x0800) { + utf8Buf[0] = (char) ((codepoint >> 6) | 0xC0); + utf8Buf[1] = (char) ((codepoint & 0x3F) | 0x80); + utf8Buf[2] = 0; + } else if (codepoint < 0x10000) { + utf8Buf[0] = (char) ((codepoint >> 12) | 0xE0); + utf8Buf[1] = (char) (((codepoint >> 6) & 0x3F) | 0x80); + utf8Buf[2] = (char) ((codepoint & 0x3F) | 0x80); + utf8Buf[3] = 0; + } else if (codepoint < 0x200000) { + utf8Buf[0] =(char)((codepoint >> 18) | 0xF0); + utf8Buf[1] =(char)(((codepoint >> 12) & 0x3F) | 0x80); + utf8Buf[2] =(char)(((codepoint >> 6) & 0x3F) | 0x80); + utf8Buf[3] =(char)((codepoint & 0x3F) | 0x80); + utf8Buf[4] = 0; + } else { + utf8Buf[0] = '?'; + utf8Buf[1] = 0; + } +} + +void yajl_string_decode(yajl_buf buf, const unsigned char * str, + size_t len) +{ + size_t beg = 0; + size_t end = 0; + + while (end < len) { + if (str[end] == '\\') { + char utf8Buf[5]; + const char * unescaped = "?"; + yajl_buf_append(buf, str + beg, end - beg); + switch (str[++end]) { + case 'r': unescaped = "\r"; break; + case 'n': unescaped = "\n"; break; + case '\\': unescaped = "\\"; break; + case '/': unescaped = "/"; break; + case '"': unescaped = "\""; break; + case 'f': unescaped = "\f"; break; + case 'b': unescaped = "\b"; break; + case 't': unescaped = "\t"; break; + case 'u': { + unsigned int codepoint = 0; + hexToDigit(&codepoint, str + ++end); + end+=3; + /* check if this is a surrogate */ + if ((codepoint & 0xFC00) == 0xD800) { + end++; + if (str[end] == '\\' && str[end + 1] == 'u') { + unsigned int surrogate = 0; + hexToDigit(&surrogate, str + end + 2); + codepoint = + (((codepoint & 0x3F) << 10) | + ((((codepoint >> 6) & 0xF) + 1) << 16) | + (surrogate & 0x3FF)); + end += 5; + } else { + unescaped = "?"; + break; + } + } + + Utf32toUtf8(codepoint, utf8Buf); + unescaped = utf8Buf; + + if (codepoint == 0) { + yajl_buf_append(buf, unescaped, 1); + beg = ++end; + continue; + } + + break; + } + default: + assert("this should never happen" == NULL); + } + yajl_buf_append(buf, unescaped, (unsigned int)strlen(unescaped)); + beg = ++end; + } else { + end++; + } + } + yajl_buf_append(buf, str + beg, end - beg); +} + +#define ADV_PTR s++; if (!(len--)) return 0; + +int yajl_string_validate_utf8(const unsigned char * s, size_t len) +{ + if (!len) return 1; + if (!s) return 0; + + while (len--) { + /* single byte */ + if (*s <= 0x7f) { + /* noop */ + } + /* two byte */ + else if ((*s >> 5) == 0x6) { + ADV_PTR; + if (!((*s >> 6) == 0x2)) return 0; + } + /* three byte */ + else if ((*s >> 4) == 0x0e) { + ADV_PTR; + if (!((*s >> 6) == 0x2)) return 0; + ADV_PTR; + if (!((*s >> 6) == 0x2)) return 0; + } + /* four byte */ + else if ((*s >> 3) == 0x1e) { + ADV_PTR; + if (!((*s >> 6) == 0x2)) return 0; + ADV_PTR; + if (!((*s >> 6) == 0x2)) return 0; + ADV_PTR; + if (!((*s >> 6) == 0x2)) return 0; + } else { + return 0; + } + + s++; + } + + return 1; +} diff --git a/xlators/cluster/nsr-server/src/yajl_encode.h b/xlators/cluster/nsr-server/src/yajl_encode.h new file mode 100644 index 000000000..af1e8bbde --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_encode.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef __YAJL_ENCODE_H__ +#define __YAJL_ENCODE_H__ + +#include "yajl_buf.h" +#include "yajl/yajl_gen.h" + +void yajl_string_encode(const yajl_print_t printer, + void * ctx, + const unsigned char * str, + size_t length, + int escape_solidus); + +void yajl_string_decode(yajl_buf buf, const unsigned char * str, + size_t length); + +int yajl_string_validate_utf8(const unsigned char * s, size_t len); + +#endif diff --git a/xlators/cluster/nsr-server/src/yajl_gen.c b/xlators/cluster/nsr-server/src/yajl_gen.c new file mode 100644 index 000000000..73763a9e0 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_gen.c @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "yajl/yajl_gen.h" +#include "yajl_buf.h" +#include "yajl_encode.h" + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <math.h> +#include <stdarg.h> + +typedef enum { + yajl_gen_start, + yajl_gen_map_start, + yajl_gen_map_key, + yajl_gen_map_val, + yajl_gen_array_start, + yajl_gen_in_array, + yajl_gen_complete, + yajl_gen_error +} yajl_gen_state; + +struct yajl_gen_t +{ + unsigned int flags; + unsigned int depth; + const char * indentString; + yajl_gen_state state[YAJL_MAX_DEPTH]; + yajl_print_t print; + void * ctx; /* yajl_buf */ + /* memory allocation routines */ + yajl_alloc_funcs alloc; +}; + +int +yajl_gen_config(yajl_gen g, yajl_gen_option opt, ...) +{ + int rv = 1; + va_list ap; + va_start(ap, opt); + + switch(opt) { + case yajl_gen_beautify: + case yajl_gen_validate_utf8: + if (va_arg(ap, int)) g->flags |= opt; + else g->flags &= ~opt; + break; + case yajl_gen_indent_string: { + const char *indent = va_arg(ap, const char *); + g->indentString = indent; + for (; *indent; indent++) { + if (*indent != '\n' + && *indent != '\v' + && *indent != '\f' + && *indent != '\t' + && *indent != '\r' + && *indent != ' ') + { + g->indentString = NULL; + rv = 0; + } + } + break; + } + case yajl_gen_print_callback: + yajl_buf_free(g->ctx); + g->print = va_arg(ap, const yajl_print_t); + g->ctx = va_arg(ap, void *); + break; + default: + rv = 0; + } + + va_end(ap); + + return rv; +} + + + +yajl_gen +yajl_gen_alloc(const yajl_alloc_funcs * afs) +{ + yajl_gen g = NULL; + yajl_alloc_funcs afsBuffer; + + /* first order of business is to set up memory allocation routines */ + if (afs != NULL) { + if (afs->malloc == NULL || afs->realloc == NULL || afs->free == NULL) + { + return NULL; + } + } else { + yajl_set_default_alloc_funcs(&afsBuffer); + afs = &afsBuffer; + } + + g = (yajl_gen) YA_MALLOC(afs, sizeof(struct yajl_gen_t)); + if (!g) return NULL; + + memset((void *) g, 0, sizeof(struct yajl_gen_t)); + /* copy in pointers to allocation routines */ + memcpy((void *) &(g->alloc), (void *) afs, sizeof(yajl_alloc_funcs)); + + g->print = (yajl_print_t)&yajl_buf_append; + g->ctx = yajl_buf_alloc(&(g->alloc)); + g->indentString = " "; + + return g; +} + +void +yajl_gen_free(yajl_gen g) +{ + if (g->print == (yajl_print_t)&yajl_buf_append) yajl_buf_free((yajl_buf)g->ctx); + YA_FREE(&(g->alloc), g); +} + +#define INSERT_SEP \ + if (g->state[g->depth] == yajl_gen_map_key || \ + g->state[g->depth] == yajl_gen_in_array) { \ + g->print(g->ctx, ",", 1); \ + if ((g->flags & yajl_gen_beautify)) g->print(g->ctx, "\n", 1); \ + } else if (g->state[g->depth] == yajl_gen_map_val) { \ + g->print(g->ctx, ":", 1); \ + if ((g->flags & yajl_gen_beautify)) g->print(g->ctx, " ", 1); \ + } + +#define INSERT_WHITESPACE \ + if ((g->flags & yajl_gen_beautify)) { \ + if (g->state[g->depth] != yajl_gen_map_val) { \ + unsigned int _i; \ + for (_i=0;_i<g->depth;_i++) \ + g->print(g->ctx, \ + g->indentString, \ + (unsigned int)strlen(g->indentString)); \ + } \ + } + +#define ENSURE_NOT_KEY \ + if (g->state[g->depth] == yajl_gen_map_key || \ + g->state[g->depth] == yajl_gen_map_start) { \ + return yajl_gen_keys_must_be_strings; \ + } \ + +/* check that we're not complete, or in error state. in a valid state + * to be generating */ +#define ENSURE_VALID_STATE \ + if (g->state[g->depth] == yajl_gen_error) { \ + return yajl_gen_in_error_state;\ + } else if (g->state[g->depth] == yajl_gen_complete) { \ + return yajl_gen_generation_complete; \ + } + +#define INCREMENT_DEPTH \ + if (++(g->depth) >= YAJL_MAX_DEPTH) return yajl_max_depth_exceeded; + +#define DECREMENT_DEPTH \ + if (--(g->depth) >= YAJL_MAX_DEPTH) return yajl_gen_error; + +#define APPENDED_ATOM \ + switch (g->state[g->depth]) { \ + case yajl_gen_start: \ + g->state[g->depth] = yajl_gen_complete; \ + break; \ + case yajl_gen_map_start: \ + case yajl_gen_map_key: \ + g->state[g->depth] = yajl_gen_map_val; \ + break; \ + case yajl_gen_array_start: \ + g->state[g->depth] = yajl_gen_in_array; \ + break; \ + case yajl_gen_map_val: \ + g->state[g->depth] = yajl_gen_map_key; \ + break; \ + default: \ + break; \ + } \ + +#define FINAL_NEWLINE \ + if ((g->flags & yajl_gen_beautify) && g->state[g->depth] == yajl_gen_complete) \ + g->print(g->ctx, "\n", 1); + +yajl_gen_status +yajl_gen_integer(yajl_gen g, long long int number) +{ + char i[32]; + ENSURE_VALID_STATE; ENSURE_NOT_KEY; INSERT_SEP; INSERT_WHITESPACE; + sprintf(i, "%lld", number); + g->print(g->ctx, i, (unsigned int)strlen(i)); + APPENDED_ATOM; + FINAL_NEWLINE; + return yajl_gen_status_ok; +} + +#ifdef WIN32 +#include <float.h> +#define isnan _isnan +#define isinf !_finite +#endif + +yajl_gen_status +yajl_gen_double(yajl_gen g, double number) +{ + char i[32]; + ENSURE_VALID_STATE; ENSURE_NOT_KEY; + if (isnan(number) || isinf(number)) return yajl_gen_invalid_number; + INSERT_SEP; INSERT_WHITESPACE; + sprintf(i, "%.20g", number); + g->print(g->ctx, i, (unsigned int)strlen(i)); + APPENDED_ATOM; + FINAL_NEWLINE; + return yajl_gen_status_ok; +} + +yajl_gen_status +yajl_gen_number(yajl_gen g, const char * s, size_t l) +{ + ENSURE_VALID_STATE; ENSURE_NOT_KEY; INSERT_SEP; INSERT_WHITESPACE; + g->print(g->ctx, s, l); + APPENDED_ATOM; + FINAL_NEWLINE; + return yajl_gen_status_ok; +} + +yajl_gen_status +yajl_gen_string(yajl_gen g, const unsigned char * str, + size_t len) +{ + // if validation is enabled, check that the string is valid utf8 + // XXX: This checking could be done a little faster, in the same pass as + // the string encoding + if (g->flags & yajl_gen_validate_utf8) { + if (!yajl_string_validate_utf8(str, len)) { + return yajl_gen_invalid_string; + } + } + ENSURE_VALID_STATE; INSERT_SEP; INSERT_WHITESPACE; + g->print(g->ctx, "\"", 1); + yajl_string_encode(g->print, g->ctx, str, len, g->flags & yajl_gen_escape_solidus); + g->print(g->ctx, "\"", 1); + APPENDED_ATOM; + FINAL_NEWLINE; + return yajl_gen_status_ok; +} + +yajl_gen_status +yajl_gen_null(yajl_gen g) +{ + ENSURE_VALID_STATE; ENSURE_NOT_KEY; INSERT_SEP; INSERT_WHITESPACE; + g->print(g->ctx, "null", strlen("null")); + APPENDED_ATOM; + FINAL_NEWLINE; + return yajl_gen_status_ok; +} + +yajl_gen_status +yajl_gen_bool(yajl_gen g, int boolean) +{ + const char * val = boolean ? "true" : "false"; + + ENSURE_VALID_STATE; ENSURE_NOT_KEY; INSERT_SEP; INSERT_WHITESPACE; + g->print(g->ctx, val, (unsigned int)strlen(val)); + APPENDED_ATOM; + FINAL_NEWLINE; + return yajl_gen_status_ok; +} + +yajl_gen_status +yajl_gen_map_open(yajl_gen g) +{ + ENSURE_VALID_STATE; ENSURE_NOT_KEY; INSERT_SEP; INSERT_WHITESPACE; + INCREMENT_DEPTH; + + g->state[g->depth] = yajl_gen_map_start; + g->print(g->ctx, "{", 1); + if ((g->flags & yajl_gen_beautify)) g->print(g->ctx, "\n", 1); + FINAL_NEWLINE; + return yajl_gen_status_ok; +} + +yajl_gen_status +yajl_gen_map_close(yajl_gen g) +{ + ENSURE_VALID_STATE; + DECREMENT_DEPTH; + + if ((g->flags & yajl_gen_beautify)) g->print(g->ctx, "\n", 1); + APPENDED_ATOM; + INSERT_WHITESPACE; + g->print(g->ctx, "}", 1); + FINAL_NEWLINE; + return yajl_gen_status_ok; +} + +yajl_gen_status +yajl_gen_array_open(yajl_gen g) +{ + ENSURE_VALID_STATE; ENSURE_NOT_KEY; INSERT_SEP; INSERT_WHITESPACE; + INCREMENT_DEPTH; + g->state[g->depth] = yajl_gen_array_start; + g->print(g->ctx, "[", 1); + if ((g->flags & yajl_gen_beautify)) g->print(g->ctx, "\n", 1); + FINAL_NEWLINE; + return yajl_gen_status_ok; +} + +yajl_gen_status +yajl_gen_array_close(yajl_gen g) +{ + ENSURE_VALID_STATE; + DECREMENT_DEPTH; + if ((g->flags & yajl_gen_beautify)) g->print(g->ctx, "\n", 1); + APPENDED_ATOM; + INSERT_WHITESPACE; + g->print(g->ctx, "]", 1); + FINAL_NEWLINE; + return yajl_gen_status_ok; +} + +yajl_gen_status +yajl_gen_get_buf(yajl_gen g, const unsigned char ** buf, + size_t * len) +{ + if (g->print != (yajl_print_t)&yajl_buf_append) return yajl_gen_no_buf; + *buf = yajl_buf_data((yajl_buf)g->ctx); + *len = yajl_buf_len((yajl_buf)g->ctx); + return yajl_gen_status_ok; +} + +void +yajl_gen_clear(yajl_gen g) +{ + if (g->print == (yajl_print_t)&yajl_buf_append) yajl_buf_clear((yajl_buf)g->ctx); +} diff --git a/xlators/cluster/nsr-server/src/yajl_lex.c b/xlators/cluster/nsr-server/src/yajl_lex.c new file mode 100644 index 000000000..b098e6a99 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_lex.c @@ -0,0 +1,763 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "yajl_lex.h" +#include "yajl_buf.h" + +#include <stdlib.h> +#include <stdio.h> +#include <assert.h> +#include <string.h> + +#ifdef YAJL_LEXER_DEBUG +static const char * +tokToStr(yajl_tok tok) +{ + switch (tok) { + case yajl_tok_bool: return "bool"; + case yajl_tok_colon: return "colon"; + case yajl_tok_comma: return "comma"; + case yajl_tok_eof: return "eof"; + case yajl_tok_error: return "error"; + case yajl_tok_left_brace: return "brace"; + case yajl_tok_left_bracket: return "bracket"; + case yajl_tok_null: return "null"; + case yajl_tok_integer: return "integer"; + case yajl_tok_double: return "double"; + case yajl_tok_right_brace: return "brace"; + case yajl_tok_right_bracket: return "bracket"; + case yajl_tok_string: return "string"; + case yajl_tok_string_with_escapes: return "string_with_escapes"; + } + return "unknown"; +} +#endif + +/* Impact of the stream parsing feature on the lexer: + * + * YAJL support stream parsing. That is, the ability to parse the first + * bits of a chunk of JSON before the last bits are available (still on + * the network or disk). This makes the lexer more complex. The + * responsibility of the lexer is to handle transparently the case where + * a chunk boundary falls in the middle of a token. This is + * accomplished is via a buffer and a character reading abstraction. + * + * Overview of implementation + * + * When we lex to end of input string before end of token is hit, we + * copy all of the input text composing the token into our lexBuf. + * + * Every time we read a character, we do so through the readChar function. + * readChar's responsibility is to handle pulling all chars from the buffer + * before pulling chars from input text + */ + +struct yajl_lexer_t { + /* the overal line and char offset into the data */ + size_t lineOff; + size_t charOff; + + /* error */ + yajl_lex_error error; + + /* a input buffer to handle the case where a token is spread over + * multiple chunks */ + yajl_buf buf; + + /* in the case where we have data in the lexBuf, bufOff holds + * the current offset into the lexBuf. */ + size_t bufOff; + + /* are we using the lex buf? */ + unsigned int bufInUse; + + /* shall we allow comments? */ + unsigned int allowComments; + + /* shall we validate utf8 inside strings? */ + unsigned int validateUTF8; + + yajl_alloc_funcs * alloc; +}; + +#define readChar(lxr, txt, off) \ + (((lxr)->bufInUse && yajl_buf_len((lxr)->buf) && lxr->bufOff < yajl_buf_len((lxr)->buf)) ? \ + (*((const unsigned char *) yajl_buf_data((lxr)->buf) + ((lxr)->bufOff)++)) : \ + ((txt)[(*(off))++])) + +#define unreadChar(lxr, off) ((*(off) > 0) ? (*(off))-- : ((lxr)->bufOff--)) + +yajl_lexer +yajl_lex_alloc(yajl_alloc_funcs * alloc, + unsigned int allowComments, unsigned int validateUTF8) +{ + yajl_lexer lxr = (yajl_lexer) YA_MALLOC(alloc, sizeof(struct yajl_lexer_t)); + memset((void *) lxr, 0, sizeof(struct yajl_lexer_t)); + lxr->buf = yajl_buf_alloc(alloc); + lxr->allowComments = allowComments; + lxr->validateUTF8 = validateUTF8; + lxr->alloc = alloc; + return lxr; +} + +void +yajl_lex_free(yajl_lexer lxr) +{ + yajl_buf_free(lxr->buf); + YA_FREE(lxr->alloc, lxr); + return; +} + +/* a lookup table which lets us quickly determine three things: + * VEC - valid escaped control char + * note. the solidus '/' may be escaped or not. + * IJC - invalid json char + * VHC - valid hex char + * NFP - needs further processing (from a string scanning perspective) + * NUC - needs utf8 checking when enabled (from a string scanning perspective) + */ +#define VEC 0x01 +#define IJC 0x02 +#define VHC 0x04 +#define NFP 0x08 +#define NUC 0x10 + +static const char charLookupTable[256] = +{ +/*00*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC , +/*08*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC , +/*10*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC , +/*18*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC , + +/*20*/ 0 , 0 , NFP|VEC|IJC, 0 , 0 , 0 , 0 , 0 , +/*28*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , VEC , +/*30*/ VHC , VHC , VHC , VHC , VHC , VHC , VHC , VHC , +/*38*/ VHC , VHC , 0 , 0 , 0 , 0 , 0 , 0 , + +/*40*/ 0 , VHC , VHC , VHC , VHC , VHC , VHC , 0 , +/*48*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , +/*50*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , +/*58*/ 0 , 0 , 0 , 0 , NFP|VEC|IJC, 0 , 0 , 0 , + +/*60*/ 0 , VHC , VEC|VHC, VHC , VHC , VHC , VEC|VHC, 0 , +/*68*/ 0 , 0 , 0 , 0 , 0 , 0 , VEC , 0 , +/*70*/ 0 , 0 , VEC , 0 , VEC , 0 , 0 , 0 , +/*78*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , + + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC +}; + +/** process a variable length utf8 encoded codepoint. + * + * returns: + * yajl_tok_string - if valid utf8 char was parsed and offset was + * advanced + * yajl_tok_eof - if end of input was hit before validation could + * complete + * yajl_tok_error - if invalid utf8 was encountered + * + * NOTE: on error the offset will point to the first char of the + * invalid utf8 */ +#define UTF8_CHECK_EOF if (*offset >= jsonTextLen) { return yajl_tok_eof; } + +static yajl_tok +yajl_lex_utf8_char(yajl_lexer lexer, const unsigned char * jsonText, + size_t jsonTextLen, size_t * offset, + unsigned char curChar) +{ + if (curChar <= 0x7f) { + /* single byte */ + return yajl_tok_string; + } else if ((curChar >> 5) == 0x6) { + /* two byte */ + UTF8_CHECK_EOF; + curChar = readChar(lexer, jsonText, offset); + if ((curChar >> 6) == 0x2) return yajl_tok_string; + } else if ((curChar >> 4) == 0x0e) { + /* three byte */ + UTF8_CHECK_EOF; + curChar = readChar(lexer, jsonText, offset); + if ((curChar >> 6) == 0x2) { + UTF8_CHECK_EOF; + curChar = readChar(lexer, jsonText, offset); + if ((curChar >> 6) == 0x2) return yajl_tok_string; + } + } else if ((curChar >> 3) == 0x1e) { + /* four byte */ + UTF8_CHECK_EOF; + curChar = readChar(lexer, jsonText, offset); + if ((curChar >> 6) == 0x2) { + UTF8_CHECK_EOF; + curChar = readChar(lexer, jsonText, offset); + if ((curChar >> 6) == 0x2) { + UTF8_CHECK_EOF; + curChar = readChar(lexer, jsonText, offset); + if ((curChar >> 6) == 0x2) return yajl_tok_string; + } + } + } + + return yajl_tok_error; +} + +/* lex a string. input is the lexer, pointer to beginning of + * json text, and start of string (offset). + * a token is returned which has the following meanings: + * yajl_tok_string: lex of string was successful. offset points to + * terminating '"'. + * yajl_tok_eof: end of text was encountered before we could complete + * the lex. + * yajl_tok_error: embedded in the string were unallowable chars. offset + * points to the offending char + */ +#define STR_CHECK_EOF \ +if (*offset >= jsonTextLen) { \ + tok = yajl_tok_eof; \ + goto finish_string_lex; \ +} + +/** scan a string for interesting characters that might need further + * review. return the number of chars that are uninteresting and can + * be skipped. + * (lth) hi world, any thoughts on how to make this routine faster? */ +static size_t +yajl_string_scan(const unsigned char * buf, size_t len, int utf8check) +{ + unsigned char mask = IJC|NFP|(utf8check ? NUC : 0); + size_t skip = 0; + while (skip < len && !(charLookupTable[*buf] & mask)) + { + skip++; + buf++; + } + return skip; +} + +static yajl_tok +yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText, + size_t jsonTextLen, size_t * offset) +{ + yajl_tok tok = yajl_tok_error; + int hasEscapes = 0; + + for (;;) { + unsigned char curChar; + + /* now jump into a faster scanning routine to skip as much + * of the buffers as possible */ + { + const unsigned char * p; + size_t len; + + if ((lexer->bufInUse && yajl_buf_len(lexer->buf) && + lexer->bufOff < yajl_buf_len(lexer->buf))) + { + p = ((const unsigned char *) yajl_buf_data(lexer->buf) + + (lexer->bufOff)); + len = yajl_buf_len(lexer->buf) - lexer->bufOff; + lexer->bufOff += yajl_string_scan(p, len, lexer->validateUTF8); + } + else if (*offset < jsonTextLen) + { + p = jsonText + *offset; + len = jsonTextLen - *offset; + *offset += yajl_string_scan(p, len, lexer->validateUTF8); + } + } + + STR_CHECK_EOF; + + curChar = readChar(lexer, jsonText, offset); + + /* quote terminates */ + if (curChar == '"') { + tok = yajl_tok_string; + break; + } + /* backslash escapes a set of control chars, */ + else if (curChar == '\\') { + hasEscapes = 1; + STR_CHECK_EOF; + + /* special case \u */ + curChar = readChar(lexer, jsonText, offset); + if (curChar == 'u') { + unsigned int i = 0; + + for (i=0;i<4;i++) { + STR_CHECK_EOF; + curChar = readChar(lexer, jsonText, offset); + if (!(charLookupTable[curChar] & VHC)) { + /* back up to offending char */ + unreadChar(lexer, offset); + lexer->error = yajl_lex_string_invalid_hex_char; + goto finish_string_lex; + } + } + } else if (!(charLookupTable[curChar] & VEC)) { + /* back up to offending char */ + unreadChar(lexer, offset); + lexer->error = yajl_lex_string_invalid_escaped_char; + goto finish_string_lex; + } + } + /* when not validating UTF8 it's a simple table lookup to determine + * if the present character is invalid */ + else if(charLookupTable[curChar] & IJC) { + /* back up to offending char */ + unreadChar(lexer, offset); + lexer->error = yajl_lex_string_invalid_json_char; + goto finish_string_lex; + } + /* when in validate UTF8 mode we need to do some extra work */ + else if (lexer->validateUTF8) { + yajl_tok t = yajl_lex_utf8_char(lexer, jsonText, jsonTextLen, + offset, curChar); + + if (t == yajl_tok_eof) { + tok = yajl_tok_eof; + goto finish_string_lex; + } else if (t == yajl_tok_error) { + lexer->error = yajl_lex_string_invalid_utf8; + goto finish_string_lex; + } + } + /* accept it, and move on */ + } + finish_string_lex: + /* tell our buddy, the parser, wether he needs to process this string + * again */ + if (hasEscapes && tok == yajl_tok_string) { + tok = yajl_tok_string_with_escapes; + } + + return tok; +} + +#define RETURN_IF_EOF if (*offset >= jsonTextLen) return yajl_tok_eof; + +static yajl_tok +yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText, + size_t jsonTextLen, size_t * offset) +{ + /** XXX: numbers are the only entities in json that we must lex + * _beyond_ in order to know that they are complete. There + * is an ambiguous case for integers at EOF. */ + + unsigned char c; + + yajl_tok tok = yajl_tok_integer; + + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + + /* optional leading minus */ + if (c == '-') { + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + } + + /* a single zero, or a series of integers */ + if (c == '0') { + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + } else if (c >= '1' && c <= '9') { + do { + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + } while (c >= '0' && c <= '9'); + } else { + unreadChar(lexer, offset); + lexer->error = yajl_lex_missing_integer_after_minus; + return yajl_tok_error; + } + + /* optional fraction (indicates this is floating point) */ + if (c == '.') { + int numRd = 0; + + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + + while (c >= '0' && c <= '9') { + numRd++; + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + } + + if (!numRd) { + unreadChar(lexer, offset); + lexer->error = yajl_lex_missing_integer_after_decimal; + return yajl_tok_error; + } + tok = yajl_tok_double; + } + + /* optional exponent (indicates this is floating point) */ + if (c == 'e' || c == 'E') { + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + + /* optional sign */ + if (c == '+' || c == '-') { + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + } + + if (c >= '0' && c <= '9') { + do { + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + } while (c >= '0' && c <= '9'); + } else { + unreadChar(lexer, offset); + lexer->error = yajl_lex_missing_integer_after_exponent; + return yajl_tok_error; + } + tok = yajl_tok_double; + } + + /* we always go "one too far" */ + unreadChar(lexer, offset); + + return tok; +} + +static yajl_tok +yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText, + size_t jsonTextLen, size_t * offset) +{ + unsigned char c; + + yajl_tok tok = yajl_tok_comment; + + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + + /* either slash or star expected */ + if (c == '/') { + /* now we throw away until end of line */ + do { + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + } while (c != '\n'); + } else if (c == '*') { + /* now we throw away until end of comment */ + for (;;) { + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + if (c == '*') { + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + if (c == '/') { + break; + } else { + unreadChar(lexer, offset); + } + } + } + } else { + lexer->error = yajl_lex_invalid_char; + tok = yajl_tok_error; + } + + return tok; +} + +yajl_tok +yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText, + size_t jsonTextLen, size_t * offset, + const unsigned char ** outBuf, size_t * outLen) +{ + yajl_tok tok = yajl_tok_error; + unsigned char c; + size_t startOffset = *offset; + + *outBuf = NULL; + *outLen = 0; + + for (;;) { + assert(*offset <= jsonTextLen); + + if (*offset >= jsonTextLen) { + tok = yajl_tok_eof; + goto lexed; + } + + c = readChar(lexer, jsonText, offset); + + switch (c) { + case '{': + tok = yajl_tok_left_bracket; + goto lexed; + case '}': + tok = yajl_tok_right_bracket; + goto lexed; + case '[': + tok = yajl_tok_left_brace; + goto lexed; + case ']': + tok = yajl_tok_right_brace; + goto lexed; + case ',': + tok = yajl_tok_comma; + goto lexed; + case ':': + tok = yajl_tok_colon; + goto lexed; + case '\t': case '\n': case '\v': case '\f': case '\r': case ' ': + startOffset++; + break; + case 't': { + const char * want = "rue"; + do { + if (*offset >= jsonTextLen) { + tok = yajl_tok_eof; + goto lexed; + } + c = readChar(lexer, jsonText, offset); + if (c != *want) { + unreadChar(lexer, offset); + lexer->error = yajl_lex_invalid_string; + tok = yajl_tok_error; + goto lexed; + } + } while (*(++want)); + tok = yajl_tok_bool; + goto lexed; + } + case 'f': { + const char * want = "alse"; + do { + if (*offset >= jsonTextLen) { + tok = yajl_tok_eof; + goto lexed; + } + c = readChar(lexer, jsonText, offset); + if (c != *want) { + unreadChar(lexer, offset); + lexer->error = yajl_lex_invalid_string; + tok = yajl_tok_error; + goto lexed; + } + } while (*(++want)); + tok = yajl_tok_bool; + goto lexed; + } + case 'n': { + const char * want = "ull"; + do { + if (*offset >= jsonTextLen) { + tok = yajl_tok_eof; + goto lexed; + } + c = readChar(lexer, jsonText, offset); + if (c != *want) { + unreadChar(lexer, offset); + lexer->error = yajl_lex_invalid_string; + tok = yajl_tok_error; + goto lexed; + } + } while (*(++want)); + tok = yajl_tok_null; + goto lexed; + } + case '"': { + tok = yajl_lex_string(lexer, (const unsigned char *) jsonText, + jsonTextLen, offset); + goto lexed; + } + case '-': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': { + /* integer parsing wants to start from the beginning */ + unreadChar(lexer, offset); + tok = yajl_lex_number(lexer, (const unsigned char *) jsonText, + jsonTextLen, offset); + goto lexed; + } + case '/': + /* hey, look, a probable comment! If comments are disabled + * it's an error. */ + if (!lexer->allowComments) { + unreadChar(lexer, offset); + lexer->error = yajl_lex_unallowed_comment; + tok = yajl_tok_error; + goto lexed; + } + /* if comments are enabled, then we should try to lex + * the thing. possible outcomes are + * - successful lex (tok_comment, which means continue), + * - malformed comment opening (slash not followed by + * '*' or '/') (tok_error) + * - eof hit. (tok_eof) */ + tok = yajl_lex_comment(lexer, (const unsigned char *) jsonText, + jsonTextLen, offset); + if (tok == yajl_tok_comment) { + /* "error" is silly, but that's the initial + * state of tok. guilty until proven innocent. */ + tok = yajl_tok_error; + yajl_buf_clear(lexer->buf); + lexer->bufInUse = 0; + startOffset = *offset; + break; + } + /* hit error or eof, bail */ + goto lexed; + default: + lexer->error = yajl_lex_invalid_char; + tok = yajl_tok_error; + goto lexed; + } + } + + + lexed: + /* need to append to buffer if the buffer is in use or + * if it's an EOF token */ + if (tok == yajl_tok_eof || lexer->bufInUse) { + if (!lexer->bufInUse) yajl_buf_clear(lexer->buf); + lexer->bufInUse = 1; + yajl_buf_append(lexer->buf, jsonText + startOffset, *offset - startOffset); + lexer->bufOff = 0; + + if (tok != yajl_tok_eof) { + *outBuf = yajl_buf_data(lexer->buf); + *outLen = yajl_buf_len(lexer->buf); + lexer->bufInUse = 0; + } + } else if (tok != yajl_tok_error) { + *outBuf = jsonText + startOffset; + *outLen = *offset - startOffset; + } + + /* special case for strings. skip the quotes. */ + if (tok == yajl_tok_string || tok == yajl_tok_string_with_escapes) + { + assert(*outLen >= 2); + (*outBuf)++; + *outLen -= 2; + } + + +#ifdef YAJL_LEXER_DEBUG + if (tok == yajl_tok_error) { + printf("lexical error: %s\n", + yajl_lex_error_to_string(yajl_lex_get_error(lexer))); + } else if (tok == yajl_tok_eof) { + printf("EOF hit\n"); + } else { + printf("lexed %s: '", tokToStr(tok)); + fwrite(*outBuf, 1, *outLen, stdout); + printf("'\n"); + } +#endif + + return tok; +} + +const char * +yajl_lex_error_to_string(yajl_lex_error error) +{ + switch (error) { + case yajl_lex_e_ok: + return "ok, no error"; + case yajl_lex_string_invalid_utf8: + return "invalid bytes in UTF8 string."; + case yajl_lex_string_invalid_escaped_char: + return "inside a string, '\\' occurs before a character " + "which it may not."; + case yajl_lex_string_invalid_json_char: + return "invalid character inside string."; + case yajl_lex_string_invalid_hex_char: + return "invalid (non-hex) character occurs after '\\u' inside " + "string."; + case yajl_lex_invalid_char: + return "invalid char in json text."; + case yajl_lex_invalid_string: + return "invalid string in json text."; + case yajl_lex_missing_integer_after_exponent: + return "malformed number, a digit is required after the exponent."; + case yajl_lex_missing_integer_after_decimal: + return "malformed number, a digit is required after the " + "decimal point."; + case yajl_lex_missing_integer_after_minus: + return "malformed number, a digit is required after the " + "minus sign."; + case yajl_lex_unallowed_comment: + return "probable comment found in input text, comments are " + "not enabled."; + } + return "unknown error code"; +} + + +/** allows access to more specific information about the lexical + * error when yajl_lex_lex returns yajl_tok_error. */ +yajl_lex_error +yajl_lex_get_error(yajl_lexer lexer) +{ + if (lexer == NULL) return (yajl_lex_error) -1; + return lexer->error; +} + +size_t yajl_lex_current_line(yajl_lexer lexer) +{ + return lexer->lineOff; +} + +size_t yajl_lex_current_char(yajl_lexer lexer) +{ + return lexer->charOff; +} + +yajl_tok yajl_lex_peek(yajl_lexer lexer, const unsigned char * jsonText, + size_t jsonTextLen, size_t offset) +{ + const unsigned char * outBuf; + size_t outLen; + size_t bufLen = yajl_buf_len(lexer->buf); + size_t bufOff = lexer->bufOff; + unsigned int bufInUse = lexer->bufInUse; + yajl_tok tok; + + tok = yajl_lex_lex(lexer, jsonText, jsonTextLen, &offset, + &outBuf, &outLen); + + lexer->bufOff = bufOff; + lexer->bufInUse = bufInUse; + yajl_buf_truncate(lexer->buf, bufLen); + + return tok; +} diff --git a/xlators/cluster/nsr-server/src/yajl_lex.h b/xlators/cluster/nsr-server/src/yajl_lex.h new file mode 100644 index 000000000..cbaae0c13 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_lex.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef __YAJL_LEX_H__ +#define __YAJL_LEX_H__ + +#include "yajl/yajl_common.h" + +typedef enum { + yajl_tok_bool, + yajl_tok_colon, + yajl_tok_comma, + yajl_tok_eof, + yajl_tok_error, + yajl_tok_left_brace, + yajl_tok_left_bracket, + yajl_tok_null, + yajl_tok_right_brace, + yajl_tok_right_bracket, + + /* we differentiate between integers and doubles to allow the + * parser to interpret the number without re-scanning */ + yajl_tok_integer, + yajl_tok_double, + + /* we differentiate between strings which require further processing, + * and strings that do not */ + yajl_tok_string, + yajl_tok_string_with_escapes, + + /* comment tokens are not currently returned to the parser, ever */ + yajl_tok_comment +} yajl_tok; + +typedef struct yajl_lexer_t * yajl_lexer; + +yajl_lexer yajl_lex_alloc(yajl_alloc_funcs * alloc, + unsigned int allowComments, + unsigned int validateUTF8); + +void yajl_lex_free(yajl_lexer lexer); + +/** + * run/continue a lex. "offset" is an input/output parameter. + * It should be initialized to zero for a + * new chunk of target text, and upon subsetquent calls with the same + * target text should passed with the value of the previous invocation. + * + * the client may be interested in the value of offset when an error is + * returned from the lexer. This allows the client to render useful +n * error messages. + * + * When you pass the next chunk of data, context should be reinitialized + * to zero. + * + * Finally, the output buffer is usually just a pointer into the jsonText, + * however in cases where the entity being lexed spans multiple chunks, + * the lexer will buffer the entity and the data returned will be + * a pointer into that buffer. + * + * This behavior is abstracted from client code except for the performance + * implications which require that the client choose a reasonable chunk + * size to get adequate performance. + */ +yajl_tok yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText, + size_t jsonTextLen, size_t * offset, + const unsigned char ** outBuf, size_t * outLen); + +/** have a peek at the next token, but don't move the lexer forward */ +yajl_tok yajl_lex_peek(yajl_lexer lexer, const unsigned char * jsonText, + size_t jsonTextLen, size_t offset); + + +typedef enum { + yajl_lex_e_ok = 0, + yajl_lex_string_invalid_utf8, + yajl_lex_string_invalid_escaped_char, + yajl_lex_string_invalid_json_char, + yajl_lex_string_invalid_hex_char, + yajl_lex_invalid_char, + yajl_lex_invalid_string, + yajl_lex_missing_integer_after_decimal, + yajl_lex_missing_integer_after_exponent, + yajl_lex_missing_integer_after_minus, + yajl_lex_unallowed_comment +} yajl_lex_error; + +const char * yajl_lex_error_to_string(yajl_lex_error error); + +/** allows access to more specific information about the lexical + * error when yajl_lex_lex returns yajl_tok_error. */ +yajl_lex_error yajl_lex_get_error(yajl_lexer lexer); + +/** get the current offset into the most recently lexed json string. */ +size_t yajl_lex_current_offset(yajl_lexer lexer); + +/** get the number of lines lexed by this lexer instance */ +size_t yajl_lex_current_line(yajl_lexer lexer); + +/** get the number of chars lexed by this lexer instance since the last + * \n or \r */ +size_t yajl_lex_current_char(yajl_lexer lexer); + +#endif diff --git a/xlators/cluster/nsr-server/src/yajl_parser.c b/xlators/cluster/nsr-server/src/yajl_parser.c new file mode 100644 index 000000000..bf9ef24ef --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_parser.c @@ -0,0 +1,492 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "yajl/yajl_parse.h" +#include "yajl_lex.h" +#include "yajl_parser.h" +#include "yajl_encode.h" +#include "yajl_bytestack.h" + +#include <stdlib.h> +#include <limits.h> +#include <errno.h> +#include <stdio.h> +#include <string.h> +#include <ctype.h> +#include <assert.h> +#include <math.h> + +#define MAX_VALUE_TO_MULTIPLY ((LLONG_MAX / 10) + (LLONG_MAX % 10)) + + /* same semantics as strtol */ +long long +yajl_parse_integer(const unsigned char *number, unsigned int length) +{ + long long ret = 0; + long sign = 1; + const unsigned char *pos = number; + if (*pos == '-') { pos++; sign = -1; } + if (*pos == '+') { pos++; } + + while (pos < number + length) { + if ( ret > MAX_VALUE_TO_MULTIPLY ) { + errno = ERANGE; + return sign == 1 ? LLONG_MAX : LLONG_MIN; + } + ret *= 10; + if (LLONG_MAX - ret < (*pos - '0')) { + errno = ERANGE; + return sign == 1 ? LLONG_MAX : LLONG_MIN; + } + ret += (*pos++ - '0'); + } + + return sign * ret; +} + +unsigned char * +yajl_render_error_string(yajl_handle hand, const unsigned char * jsonText, + size_t jsonTextLen, int verbose) +{ + size_t offset = hand->bytesConsumed; + unsigned char * str; + const char * errorType = NULL; + const char * errorText = NULL; + char text[72]; + const char * arrow = " (right here) ------^\n"; + + if (yajl_bs_current(hand->stateStack) == yajl_state_parse_error) { + errorType = "parse"; + errorText = hand->parseError; + } else if (yajl_bs_current(hand->stateStack) == yajl_state_lexical_error) { + errorType = "lexical"; + errorText = yajl_lex_error_to_string(yajl_lex_get_error(hand->lexer)); + } else { + errorType = "unknown"; + } + + { + size_t memneeded = 0; + memneeded += strlen(errorType); + memneeded += strlen(" error"); + if (errorText != NULL) { + memneeded += strlen(": "); + memneeded += strlen(errorText); + } + str = (unsigned char *) YA_MALLOC(&(hand->alloc), memneeded + 2); + if (!str) return NULL; + str[0] = 0; + strcat((char *) str, errorType); + strcat((char *) str, " error"); + if (errorText != NULL) { + strcat((char *) str, ": "); + strcat((char *) str, errorText); + } + strcat((char *) str, "\n"); + } + + /* now we append as many spaces as needed to make sure the error + * falls at char 41, if verbose was specified */ + if (verbose) { + size_t start, end, i; + size_t spacesNeeded; + + spacesNeeded = (offset < 30 ? 40 - offset : 10); + start = (offset >= 30 ? offset - 30 : 0); + end = (offset + 30 > jsonTextLen ? jsonTextLen : offset + 30); + + for (i=0;i<spacesNeeded;i++) text[i] = ' '; + + for (;start < end;start++, i++) { + if (jsonText[start] != '\n' && jsonText[start] != '\r') + { + text[i] = jsonText[start]; + } + else + { + text[i] = ' '; + } + } + assert(i <= 71); + text[i++] = '\n'; + text[i] = 0; + { + char * newStr = (char *) + YA_MALLOC(&(hand->alloc), (unsigned int)(strlen((char *) str) + + strlen((char *) text) + + strlen(arrow) + 1)); + if (newStr) { + newStr[0] = 0; + strcat((char *) newStr, (char *) str); + strcat((char *) newStr, text); + strcat((char *) newStr, arrow); + } + YA_FREE(&(hand->alloc), str); + str = (unsigned char *) newStr; + } + } + return str; +} + +/* check for client cancelation */ +#define _CC_CHK(x) \ + if (!(x)) { \ + yajl_bs_set(hand->stateStack, yajl_state_parse_error); \ + hand->parseError = \ + "client cancelled parse via callback return value"; \ + return yajl_status_client_canceled; \ + } + + +yajl_status +yajl_do_finish(yajl_handle hand) +{ + yajl_status stat; + stat = yajl_do_parse(hand,(const unsigned char *) " ",1); + + if (stat != yajl_status_ok) return stat; + + switch(yajl_bs_current(hand->stateStack)) + { + case yajl_state_parse_error: + case yajl_state_lexical_error: + return yajl_status_error; + case yajl_state_got_value: + case yajl_state_parse_complete: + return yajl_status_ok; + default: + if (!(hand->flags & yajl_allow_partial_values)) + { + yajl_bs_set(hand->stateStack, yajl_state_parse_error); + hand->parseError = "premature EOF"; + return yajl_status_error; + } + return yajl_status_ok; + } +} + +yajl_status +yajl_do_parse(yajl_handle hand, const unsigned char * jsonText, + size_t jsonTextLen) +{ + yajl_tok tok; + const unsigned char * buf; + size_t bufLen; + size_t * offset = &(hand->bytesConsumed); + + *offset = 0; + + around_again: + switch (yajl_bs_current(hand->stateStack)) { + case yajl_state_parse_complete: + if (hand->flags & yajl_allow_multiple_values) { + yajl_bs_set(hand->stateStack, yajl_state_got_value); + goto around_again; + } + if (!(hand->flags & yajl_allow_trailing_garbage)) { + if (*offset != jsonTextLen) { + tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen, + offset, &buf, &bufLen); + if (tok != yajl_tok_eof) { + yajl_bs_set(hand->stateStack, yajl_state_parse_error); + hand->parseError = "trailing garbage"; + } + goto around_again; + } + } + return yajl_status_ok; + case yajl_state_lexical_error: + case yajl_state_parse_error: + return yajl_status_error; + case yajl_state_start: + case yajl_state_got_value: + case yajl_state_map_need_val: + case yajl_state_array_need_val: + case yajl_state_array_start: { + /* for arrays and maps, we advance the state for this + * depth, then push the state of the next depth. + * If an error occurs during the parsing of the nesting + * enitity, the state at this level will not matter. + * a state that needs pushing will be anything other + * than state_start */ + + yajl_state stateToPush = yajl_state_start; + + tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen, + offset, &buf, &bufLen); + + switch (tok) { + case yajl_tok_eof: + return yajl_status_ok; + case yajl_tok_error: + yajl_bs_set(hand->stateStack, yajl_state_lexical_error); + goto around_again; + case yajl_tok_string: + if (hand->callbacks && hand->callbacks->yajl_string) { + _CC_CHK(hand->callbacks->yajl_string(hand->ctx, + buf, bufLen)); + } + break; + case yajl_tok_string_with_escapes: + if (hand->callbacks && hand->callbacks->yajl_string) { + yajl_buf_clear(hand->decodeBuf); + yajl_string_decode(hand->decodeBuf, buf, bufLen); + _CC_CHK(hand->callbacks->yajl_string( + hand->ctx, yajl_buf_data(hand->decodeBuf), + yajl_buf_len(hand->decodeBuf))); + } + break; + case yajl_tok_bool: + if (hand->callbacks && hand->callbacks->yajl_boolean) { + _CC_CHK(hand->callbacks->yajl_boolean(hand->ctx, + *buf == 't')); + } + break; + case yajl_tok_null: + if (hand->callbacks && hand->callbacks->yajl_null) { + _CC_CHK(hand->callbacks->yajl_null(hand->ctx)); + } + break; + case yajl_tok_left_bracket: + if (hand->callbacks && hand->callbacks->yajl_start_map) { + _CC_CHK(hand->callbacks->yajl_start_map(hand->ctx)); + } + stateToPush = yajl_state_map_start; + break; + case yajl_tok_left_brace: + if (hand->callbacks && hand->callbacks->yajl_start_array) { + _CC_CHK(hand->callbacks->yajl_start_array(hand->ctx)); + } + stateToPush = yajl_state_array_start; + break; + case yajl_tok_integer: + if (hand->callbacks) { + if (hand->callbacks->yajl_number) { + _CC_CHK(hand->callbacks->yajl_number( + hand->ctx,(const char *) buf, bufLen)); + } else if (hand->callbacks->yajl_integer) { + long long int i = 0; + i = yajl_parse_integer(buf, bufLen); + if ((i == LLONG_MIN || i == LLONG_MAX) && + errno == ERANGE) + { + yajl_bs_set(hand->stateStack, + yajl_state_parse_error); + hand->parseError = "integer overflow" ; + /* try to restore error offset */ + if (*offset >= bufLen) *offset -= bufLen; + else *offset = 0; + goto around_again; + } + _CC_CHK(hand->callbacks->yajl_integer(hand->ctx, + i)); + } + } + break; + case yajl_tok_double: + if (hand->callbacks) { + if (hand->callbacks->yajl_number) { + _CC_CHK(hand->callbacks->yajl_number( + hand->ctx, (const char *) buf, bufLen)); + } else if (hand->callbacks->yajl_double) { + double d = 0.0; + yajl_buf_clear(hand->decodeBuf); + yajl_buf_append(hand->decodeBuf, buf, bufLen); + buf = yajl_buf_data(hand->decodeBuf); + d = strtod((char *) buf, NULL); + if ((d == HUGE_VAL || d == -HUGE_VAL) && + errno == ERANGE) + { + yajl_bs_set(hand->stateStack, + yajl_state_parse_error); + hand->parseError = "numeric (floating point) " + "overflow"; + /* try to restore error offset */ + if (*offset >= bufLen) *offset -= bufLen; + else *offset = 0; + goto around_again; + } + _CC_CHK(hand->callbacks->yajl_double(hand->ctx, + d)); + } + } + break; + case yajl_tok_right_brace: { + if (yajl_bs_current(hand->stateStack) == + yajl_state_array_start) + { + if (hand->callbacks && + hand->callbacks->yajl_end_array) + { + _CC_CHK(hand->callbacks->yajl_end_array(hand->ctx)); + } + yajl_bs_pop(hand->stateStack); + goto around_again; + } + /* intentional fall-through */ + } + case yajl_tok_colon: + case yajl_tok_comma: + case yajl_tok_right_bracket: + yajl_bs_set(hand->stateStack, yajl_state_parse_error); + hand->parseError = + "unallowed token at this point in JSON text"; + goto around_again; + default: + yajl_bs_set(hand->stateStack, yajl_state_parse_error); + hand->parseError = "invalid token, internal error"; + goto around_again; + } + /* got a value. transition depends on the state we're in. */ + { + yajl_state s = yajl_bs_current(hand->stateStack); + if (s == yajl_state_start || s == yajl_state_got_value) { + yajl_bs_set(hand->stateStack, yajl_state_parse_complete); + } else if (s == yajl_state_map_need_val) { + yajl_bs_set(hand->stateStack, yajl_state_map_got_val); + } else { + yajl_bs_set(hand->stateStack, yajl_state_array_got_val); + } + } + if (stateToPush != yajl_state_start) { + yajl_bs_push(hand->stateStack, stateToPush); + } + + goto around_again; + } + case yajl_state_map_start: + case yajl_state_map_need_key: { + /* only difference between these two states is that in + * start '}' is valid, whereas in need_key, we've parsed + * a comma, and a string key _must_ follow */ + tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen, + offset, &buf, &bufLen); + switch (tok) { + case yajl_tok_eof: + return yajl_status_ok; + case yajl_tok_error: + yajl_bs_set(hand->stateStack, yajl_state_lexical_error); + goto around_again; + case yajl_tok_string_with_escapes: + if (hand->callbacks && hand->callbacks->yajl_map_key) { + yajl_buf_clear(hand->decodeBuf); + yajl_string_decode(hand->decodeBuf, buf, bufLen); + buf = yajl_buf_data(hand->decodeBuf); + bufLen = yajl_buf_len(hand->decodeBuf); + } + /* intentional fall-through */ + case yajl_tok_string: + if (hand->callbacks && hand->callbacks->yajl_map_key) { + _CC_CHK(hand->callbacks->yajl_map_key(hand->ctx, buf, + bufLen)); + } + yajl_bs_set(hand->stateStack, yajl_state_map_sep); + goto around_again; + case yajl_tok_right_bracket: + if (yajl_bs_current(hand->stateStack) == + yajl_state_map_start) + { + if (hand->callbacks && hand->callbacks->yajl_end_map) { + _CC_CHK(hand->callbacks->yajl_end_map(hand->ctx)); + } + yajl_bs_pop(hand->stateStack); + goto around_again; + } + default: + yajl_bs_set(hand->stateStack, yajl_state_parse_error); + hand->parseError = + "invalid object key (must be a string)"; + goto around_again; + } + } + case yajl_state_map_sep: { + tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen, + offset, &buf, &bufLen); + switch (tok) { + case yajl_tok_colon: + yajl_bs_set(hand->stateStack, yajl_state_map_need_val); + goto around_again; + case yajl_tok_eof: + return yajl_status_ok; + case yajl_tok_error: + yajl_bs_set(hand->stateStack, yajl_state_lexical_error); + goto around_again; + default: + yajl_bs_set(hand->stateStack, yajl_state_parse_error); + hand->parseError = "object key and value must " + "be separated by a colon (':')"; + goto around_again; + } + } + case yajl_state_map_got_val: { + tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen, + offset, &buf, &bufLen); + switch (tok) { + case yajl_tok_right_bracket: + if (hand->callbacks && hand->callbacks->yajl_end_map) { + _CC_CHK(hand->callbacks->yajl_end_map(hand->ctx)); + } + yajl_bs_pop(hand->stateStack); + goto around_again; + case yajl_tok_comma: + yajl_bs_set(hand->stateStack, yajl_state_map_need_key); + goto around_again; + case yajl_tok_eof: + return yajl_status_ok; + case yajl_tok_error: + yajl_bs_set(hand->stateStack, yajl_state_lexical_error); + goto around_again; + default: + yajl_bs_set(hand->stateStack, yajl_state_parse_error); + hand->parseError = "after key and value, inside map, " + "I expect ',' or '}'"; + /* try to restore error offset */ + if (*offset >= bufLen) *offset -= bufLen; + else *offset = 0; + goto around_again; + } + } + case yajl_state_array_got_val: { + tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen, + offset, &buf, &bufLen); + switch (tok) { + case yajl_tok_right_brace: + if (hand->callbacks && hand->callbacks->yajl_end_array) { + _CC_CHK(hand->callbacks->yajl_end_array(hand->ctx)); + } + yajl_bs_pop(hand->stateStack); + goto around_again; + case yajl_tok_comma: + yajl_bs_set(hand->stateStack, yajl_state_array_need_val); + goto around_again; + case yajl_tok_eof: + return yajl_status_ok; + case yajl_tok_error: + yajl_bs_set(hand->stateStack, yajl_state_lexical_error); + goto around_again; + default: + yajl_bs_set(hand->stateStack, yajl_state_parse_error); + hand->parseError = + "after array element, I expect ',' or ']'"; + goto around_again; + } + } + } + + abort(); + return yajl_status_error; +} + diff --git a/xlators/cluster/nsr-server/src/yajl_parser.h b/xlators/cluster/nsr-server/src/yajl_parser.h new file mode 100644 index 000000000..53409731a --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_parser.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef __YAJL_PARSER_H__ +#define __YAJL_PARSER_H__ + +#include "yajl/yajl_parse.h" +#include "yajl_bytestack.h" +#include "yajl_buf.h" +#include "yajl_lex.h" + + +typedef enum { + yajl_state_start = 0, + yajl_state_parse_complete, + yajl_state_parse_error, + yajl_state_lexical_error, + yajl_state_map_start, + yajl_state_map_sep, + yajl_state_map_need_val, + yajl_state_map_got_val, + yajl_state_map_need_key, + yajl_state_array_start, + yajl_state_array_got_val, + yajl_state_array_need_val, + yajl_state_got_value, +} yajl_state; + +struct yajl_handle_t { + const yajl_callbacks * callbacks; + void * ctx; + yajl_lexer lexer; + const char * parseError; + /* the number of bytes consumed from the last client buffer, + * in the case of an error this will be an error offset, in the + * case of an error this can be used as the error offset */ + size_t bytesConsumed; + /* temporary storage for decoded strings */ + yajl_buf decodeBuf; + /* a stack of states. access with yajl_state_XXX routines */ + yajl_bytestack stateStack; + /* memory allocation routines */ + yajl_alloc_funcs alloc; + /* bitfield */ + unsigned int flags; +}; + +yajl_status +yajl_do_parse(yajl_handle handle, const unsigned char * jsonText, + size_t jsonTextLen); + +yajl_status +yajl_do_finish(yajl_handle handle); + +unsigned char * +yajl_render_error_string(yajl_handle hand, const unsigned char * jsonText, + size_t jsonTextLen, int verbose); + +/* A little built in integer parsing routine with the same semantics as strtol + * that's unaffected by LOCALE. */ +long long +yajl_parse_integer(const unsigned char *number, unsigned int length); + + +#endif diff --git a/xlators/cluster/nsr-server/src/yajl_tree.c b/xlators/cluster/nsr-server/src/yajl_tree.c new file mode 100644 index 000000000..1a69134e7 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_tree.c @@ -0,0 +1,501 @@ +/* + * Copyright (c) 2010-2011 Florian Forster <ff at octo.it> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <assert.h> + +#include "yajl/yajl_tree.h" +#include "yajl/yajl_parse.h" + +#include "yajl_parser.h" + +#ifdef WIN32 +#define snprintf sprintf_s +#endif + +#define STATUS_CONTINUE 1 +#define STATUS_ABORT 0 + +struct stack_elem_s; +typedef struct stack_elem_s stack_elem_t; +struct stack_elem_s +{ + char * key; + yajl_val value; + stack_elem_t *next; +}; + +struct context_s +{ + stack_elem_t *stack; + yajl_val root; + char *errbuf; + size_t errbuf_size; +}; +typedef struct context_s context_t; + +#define RETURN_ERROR(ctx,retval,...) { \ + if ((ctx)->errbuf != NULL) \ + snprintf ((ctx)->errbuf, (ctx)->errbuf_size, __VA_ARGS__); \ + return (retval); \ + } + +static yajl_val value_alloc (yajl_type type) +{ + yajl_val v; + + v = malloc (sizeof (*v)); + if (v == NULL) return (NULL); + memset (v, 0, sizeof (*v)); + v->type = type; + + return (v); +} + +static void yajl_object_free (yajl_val v) +{ + size_t i; + + if (!YAJL_IS_OBJECT(v)) return; + + for (i = 0; i < v->u.object.len; i++) + { + free((char *) v->u.object.keys[i]); + v->u.object.keys[i] = NULL; + yajl_tree_free (v->u.object.values[i]); + v->u.object.values[i] = NULL; + } + + free((void*) v->u.object.keys); + free(v->u.object.values); + free(v); +} + +static void yajl_array_free (yajl_val v) +{ + size_t i; + + if (!YAJL_IS_ARRAY(v)) return; + + for (i = 0; i < v->u.array.len; i++) + { + yajl_tree_free (v->u.array.values[i]); + v->u.array.values[i] = NULL; + } + + free(v->u.array.values); + free(v); +} + +/* + * Parsing nested objects and arrays is implemented using a stack. When a new + * object or array starts (a curly or a square opening bracket is read), an + * appropriate value is pushed on the stack. When the end of the object is + * reached (an appropriate closing bracket has been read), the value is popped + * off the stack and added to the enclosing object using "context_add_value". + */ +static int context_push(context_t *ctx, yajl_val v) +{ + stack_elem_t *stack; + + stack = malloc (sizeof (*stack)); + if (stack == NULL) + RETURN_ERROR (ctx, ENOMEM, "Out of memory"); + memset (stack, 0, sizeof (*stack)); + + assert ((ctx->stack == NULL) + || YAJL_IS_OBJECT (v) + || YAJL_IS_ARRAY (v)); + + stack->value = v; + stack->next = ctx->stack; + ctx->stack = stack; + + return (0); +} + +static yajl_val context_pop(context_t *ctx) +{ + stack_elem_t *stack; + yajl_val v; + + if (ctx->stack == NULL) + RETURN_ERROR (ctx, NULL, "context_pop: " + "Bottom of stack reached prematurely"); + + stack = ctx->stack; + ctx->stack = stack->next; + + v = stack->value; + + free (stack); + + return (v); +} + +static int object_add_keyval(context_t *ctx, + yajl_val obj, char *key, yajl_val value) +{ + const char **tmpk; + yajl_val *tmpv; + + /* We're checking for NULL in "context_add_value" or its callers. */ + assert (ctx != NULL); + assert (obj != NULL); + assert (key != NULL); + assert (value != NULL); + + /* We're assuring that "obj" is an object in "context_add_value". */ + assert(YAJL_IS_OBJECT(obj)); + + tmpk = realloc((void *) obj->u.object.keys, sizeof(*(obj->u.object.keys)) * (obj->u.object.len + 1)); + if (tmpk == NULL) + RETURN_ERROR(ctx, ENOMEM, "Out of memory"); + obj->u.object.keys = tmpk; + + tmpv = realloc(obj->u.object.values, sizeof (*obj->u.object.values) * (obj->u.object.len + 1)); + if (tmpv == NULL) + RETURN_ERROR(ctx, ENOMEM, "Out of memory"); + obj->u.object.values = tmpv; + + obj->u.object.keys[obj->u.object.len] = key; + obj->u.object.values[obj->u.object.len] = value; + obj->u.object.len++; + + return (0); +} + +static int array_add_value (context_t *ctx, + yajl_val array, yajl_val value) +{ + yajl_val *tmp; + + /* We're checking for NULL pointers in "context_add_value" or its + * callers. */ + assert (ctx != NULL); + assert (array != NULL); + assert (value != NULL); + + /* "context_add_value" will only call us with array values. */ + assert(YAJL_IS_ARRAY(array)); + + tmp = realloc(array->u.array.values, + sizeof(*(array->u.array.values)) * (array->u.array.len + 1)); + if (tmp == NULL) + RETURN_ERROR(ctx, ENOMEM, "Out of memory"); + array->u.array.values = tmp; + array->u.array.values[array->u.array.len] = value; + array->u.array.len++; + + return 0; +} + +/* + * Add a value to the value on top of the stack or the "root" member in the + * context if the end of the parsing process is reached. + */ +static int context_add_value (context_t *ctx, yajl_val v) +{ + /* We're checking for NULL values in all the calling functions. */ + assert (ctx != NULL); + assert (v != NULL); + + /* + * There are three valid states in which this function may be called: + * - There is no value on the stack => This is the only value. This is the + * last step done when parsing a document. We assign the value to the + * "root" member and return. + * - The value on the stack is an object. In this case store the key on the + * stack or, if the key has already been read, add key and value to the + * object. + * - The value on the stack is an array. In this case simply add the value + * and return. + */ + if (ctx->stack == NULL) + { + assert (ctx->root == NULL); + ctx->root = v; + return (0); + } + else if (YAJL_IS_OBJECT (ctx->stack->value)) + { + if (ctx->stack->key == NULL) + { + if (!YAJL_IS_STRING (v)) + RETURN_ERROR (ctx, EINVAL, "context_add_value: " + "Object key is not a string (%#04x)", + v->type); + + ctx->stack->key = v->u.string; + v->u.string = NULL; + free(v); + return (0); + } + else /* if (ctx->key != NULL) */ + { + char * key; + + key = ctx->stack->key; + ctx->stack->key = NULL; + return (object_add_keyval (ctx, ctx->stack->value, key, v)); + } + } + else if (YAJL_IS_ARRAY (ctx->stack->value)) + { + return (array_add_value (ctx, ctx->stack->value, v)); + } + else + { + RETURN_ERROR (ctx, EINVAL, "context_add_value: Cannot add value to " + "a value of type %#04x (not a composite type)", + ctx->stack->value->type); + } +} + +static int handle_string (void *ctx, + const unsigned char *string, size_t string_length) +{ + yajl_val v; + + v = value_alloc (yajl_t_string); + if (v == NULL) + RETURN_ERROR ((context_t *) ctx, STATUS_ABORT, "Out of memory"); + + v->u.string = malloc (string_length + 1); + if (v->u.string == NULL) + { + free (v); + RETURN_ERROR ((context_t *) ctx, STATUS_ABORT, "Out of memory"); + } + memcpy(v->u.string, string, string_length); + v->u.string[string_length] = 0; + + return ((context_add_value (ctx, v) == 0) ? STATUS_CONTINUE : STATUS_ABORT); +} + +static int handle_number (void *ctx, const char *string, size_t string_length) +{ + yajl_val v; + char *endptr; + + v = value_alloc(yajl_t_number); + if (v == NULL) + RETURN_ERROR((context_t *) ctx, STATUS_ABORT, "Out of memory"); + + v->u.number.r = malloc(string_length + 1); + if (v->u.number.r == NULL) + { + free(v); + RETURN_ERROR((context_t *) ctx, STATUS_ABORT, "Out of memory"); + } + memcpy(v->u.number.r, string, string_length); + v->u.number.r[string_length] = 0; + + v->u.number.flags = 0; + + endptr = NULL; + errno = 0; + v->u.number.i = yajl_parse_integer((const unsigned char *) v->u.number.r, + strlen(v->u.number.r)); + if ((errno == 0) && (endptr != NULL) && (*endptr == 0)) + v->u.number.flags |= YAJL_NUMBER_INT_VALID; + + endptr = NULL; + errno = 0; + v->u.number.d = strtod(v->u.number.r, &endptr); + if ((errno == 0) && (endptr != NULL) && (*endptr == 0)) + v->u.number.flags |= YAJL_NUMBER_DOUBLE_VALID; + + return ((context_add_value(ctx, v) == 0) ? STATUS_CONTINUE : STATUS_ABORT); +} + +static int handle_start_map (void *ctx) +{ + yajl_val v; + + v = value_alloc(yajl_t_object); + if (v == NULL) + RETURN_ERROR ((context_t *) ctx, STATUS_ABORT, "Out of memory"); + + v->u.object.keys = NULL; + v->u.object.values = NULL; + v->u.object.len = 0; + + return ((context_push (ctx, v) == 0) ? STATUS_CONTINUE : STATUS_ABORT); +} + +static int handle_end_map (void *ctx) +{ + yajl_val v; + + v = context_pop (ctx); + if (v == NULL) + return (STATUS_ABORT); + + return ((context_add_value (ctx, v) == 0) ? STATUS_CONTINUE : STATUS_ABORT); +} + +static int handle_start_array (void *ctx) +{ + yajl_val v; + + v = value_alloc(yajl_t_array); + if (v == NULL) + RETURN_ERROR ((context_t *) ctx, STATUS_ABORT, "Out of memory"); + + v->u.array.values = NULL; + v->u.array.len = 0; + + return ((context_push (ctx, v) == 0) ? STATUS_CONTINUE : STATUS_ABORT); +} + +static int handle_end_array (void *ctx) +{ + yajl_val v; + + v = context_pop (ctx); + if (v == NULL) + return (STATUS_ABORT); + + return ((context_add_value (ctx, v) == 0) ? STATUS_CONTINUE : STATUS_ABORT); +} + +static int handle_boolean (void *ctx, int boolean_value) +{ + yajl_val v; + + v = value_alloc (boolean_value ? yajl_t_true : yajl_t_false); + if (v == NULL) + RETURN_ERROR ((context_t *) ctx, STATUS_ABORT, "Out of memory"); + + return ((context_add_value (ctx, v) == 0) ? STATUS_CONTINUE : STATUS_ABORT); +} + +static int handle_null (void *ctx) +{ + yajl_val v; + + v = value_alloc (yajl_t_null); + if (v == NULL) + RETURN_ERROR ((context_t *) ctx, STATUS_ABORT, "Out of memory"); + + return ((context_add_value (ctx, v) == 0) ? STATUS_CONTINUE : STATUS_ABORT); +} + +/* + * Public functions + */ +yajl_val yajl_tree_parse (const char *input, + char *error_buffer, size_t error_buffer_size) +{ + static const yajl_callbacks callbacks = + { + /* null = */ handle_null, + /* boolean = */ handle_boolean, + /* integer = */ NULL, + /* double = */ NULL, + /* number = */ handle_number, + /* string = */ handle_string, + /* start map = */ handle_start_map, + /* map key = */ handle_string, + /* end map = */ handle_end_map, + /* start array = */ handle_start_array, + /* end array = */ handle_end_array + }; + + yajl_handle handle; + yajl_status status; + context_t ctx = { NULL, NULL, NULL, 0 }; + + ctx.errbuf = error_buffer; + ctx.errbuf_size = error_buffer_size; + + if (error_buffer != NULL) + memset (error_buffer, 0, error_buffer_size); + + handle = yajl_alloc (&callbacks, NULL, &ctx); + yajl_config(handle, yajl_allow_comments, 1); + + status = yajl_parse(handle, + (unsigned char *) input, + strlen (input)); + status = yajl_complete_parse (handle); + if (status != yajl_status_ok) { + if (error_buffer != NULL && error_buffer_size > 0) { + snprintf( + error_buffer, error_buffer_size, "%s", + (char *) yajl_get_error(handle, 1, + (const unsigned char *) input, + strlen(input))); + } + yajl_free (handle); + return NULL; + } + + yajl_free (handle); + return (ctx.root); +} + +yajl_val yajl_tree_get(yajl_val n, const char ** path, yajl_type type) +{ + if (!path) return NULL; + while (n && *path) { + unsigned int i; + + if (n->type != yajl_t_object) return NULL; + for (i = 0; i < n->u.object.len; i++) { + if (!strcmp(*path, n->u.object.keys[i])) { + n = n->u.object.values[i]; + break; + } + } + if (i == n->u.object.len) return NULL; + path++; + } + if (n && type != yajl_t_any && type != n->type) n = NULL; + return n; +} + +void yajl_tree_free (yajl_val v) +{ + if (v == NULL) return; + + if (YAJL_IS_STRING(v)) + { + free(v->u.string); + free(v); + } + else if (YAJL_IS_NUMBER(v)) + { + free(v->u.number.r); + free(v); + } + else if (YAJL_GET_OBJECT(v)) + { + yajl_object_free(v); + } + else if (YAJL_GET_ARRAY(v)) + { + yajl_array_free(v); + } + else /* if (yajl_t_true or yajl_t_false or yajl_t_null) */ + { + free(v); + } +} diff --git a/xlators/cluster/nsr-server/src/yajl_version.c b/xlators/cluster/nsr-server/src/yajl_version.c new file mode 100644 index 000000000..0671da722 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_version.c @@ -0,0 +1,7 @@ +#include <yajl/yajl_version.h> + +int yajl_version(void) +{ + return YAJL_VERSION; +} + diff --git a/xlators/cluster/stripe/src/Makefile.am b/xlators/cluster/stripe/src/Makefile.am index 180d0da1f..4268d6f03 100644 --- a/xlators/cluster/stripe/src/Makefile.am +++ b/xlators/cluster/stripe/src/Makefile.am @@ -1,16 +1,19 @@ - xlator_LTLIBRARIES = stripe.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster -stripe_la_LDFLAGS = -module -avoidversion +stripe_la_LDFLAGS = -module -avoid-version + +stripe_la_SOURCES = stripe.c stripe-helpers.c \ + $(top_builddir)/xlators/lib/src/libxlator.c -stripe_la_SOURCES = stripe.c stripe_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = stripe.h +noinst_HEADERS = stripe.h stripe-mem-types.h $(top_builddir)/xlators/lib/src/libxlator.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/xlators/lib/src -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) +AM_CFLAGS = -Wall $(GF_CFLAGS) CLEANFILES = diff --git a/xlators/cluster/stripe/src/stripe-helpers.c b/xlators/cluster/stripe/src/stripe-helpers.c new file mode 100644 index 000000000..3c12809d6 --- /dev/null +++ b/xlators/cluster/stripe/src/stripe-helpers.c @@ -0,0 +1,677 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <fnmatch.h> + +#include "stripe.h" +#include "byte-order.h" +#include "mem-types.h" +#include "logging.h" + +void +stripe_local_wipe (stripe_local_t *local) +{ + if (!local) + goto out; + + loc_wipe (&local->loc); + loc_wipe (&local->loc2); + + if (local->fd) + fd_unref (local->fd); + + if (local->inode) + inode_unref (local->inode); + + if (local->xattr) + dict_unref (local->xattr); + + if (local->xdata) + dict_unref (local->xdata); + +out: + return; +} + + + +int +stripe_aggregate (dict_t *this, char *key, data_t *value, void *data) +{ + dict_t *dst = NULL; + int64_t *ptr = 0, *size = NULL; + int32_t ret = -1; + + dst = data; + + if (strcmp (key, GF_XATTR_QUOTA_SIZE_KEY) == 0) { + ret = dict_get_bin (dst, key, (void **)&size); + if (ret < 0) { + size = GF_CALLOC (1, sizeof (int64_t), + gf_common_mt_char); + if (size == NULL) { + gf_log ("stripe", GF_LOG_WARNING, + "memory allocation failed"); + goto out; + } + ret = dict_set_bin (dst, key, size, sizeof (int64_t)); + if (ret < 0) { + gf_log ("stripe", GF_LOG_WARNING, + "stripe aggregate dict set failed"); + GF_FREE (size); + goto out; + } + } + + ptr = data_to_bin (value); + if (ptr == NULL) { + gf_log ("stripe", GF_LOG_WARNING, "data to bin failed"); + goto out; + } + + *size = hton64 (ntoh64 (*size) + ntoh64 (*ptr)); + } else if (strcmp (key, GF_CONTENT_KEY)) { + /* No need to aggregate 'CONTENT' data */ + ret = dict_set (dst, key, value); + if (ret) + gf_log ("stripe", GF_LOG_WARNING, "xattr dict set failed"); + } + +out: + return 0; +} + + +void +stripe_aggregate_xattr (dict_t *dst, dict_t *src) +{ + if ((dst == NULL) || (src == NULL)) { + goto out; + } + + dict_foreach (src, stripe_aggregate, dst); +out: + return; +} + + +int32_t +stripe_xattr_aggregate (char *buffer, stripe_local_t *local, int32_t *total) +{ + int32_t i = 0; + int32_t ret = -1; + int32_t len = 0; + char *sbuf = NULL; + stripe_xattr_sort_t *xattr = NULL; + + if (!buffer || !local || !local->xattr_list) + goto out; + + sbuf = buffer; + + for (i = 0; i < local->nallocs; i++) { + xattr = local->xattr_list + i; + len = xattr->xattr_len; + + if (len && xattr && xattr->xattr_value) { + memcpy (buffer, xattr->xattr_value, len); + buffer += len; + *buffer++ = ' '; + } + } + + *--buffer = '\0'; + if (total) + *total = buffer - sbuf; + ret = 0; + + out: + return ret; +} + +int32_t +stripe_free_xattr_str (stripe_local_t *local) +{ + int32_t i = 0; + int32_t ret = -1; + stripe_xattr_sort_t *xattr = NULL; + + if (!local || !local->xattr_list) + goto out; + + for (i = 0; i < local->nallocs; i++) { + xattr = local->xattr_list + i; + + if (xattr && xattr->xattr_value) + GF_FREE (xattr->xattr_value); + } + + ret = 0; + out: + return ret; +} + + +int32_t +stripe_fill_lockinfo_xattr (xlator_t *this, stripe_local_t *local, + void **xattr_serz) +{ + int32_t ret = -1, i = 0, len = 0; + dict_t *tmp1 = NULL, *tmp2 = NULL; + char *buf = NULL; + stripe_xattr_sort_t *xattr = NULL; + + if (xattr_serz == NULL) { + goto out; + } + + tmp2 = dict_new (); + + if (tmp2 == NULL) { + goto out; + } + + for (i = 0; i < local->nallocs; i++) { + xattr = local->xattr_list + i; + len = xattr->xattr_len; + + if (len && xattr && xattr->xattr_value) { + ret = dict_reset (tmp2); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "dict_reset failed (%s)", + strerror (-ret)); + } + + ret = dict_unserialize (xattr->xattr_value, + xattr->xattr_len, + &tmp2); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "dict_unserialize failed (%s)", + strerror (-ret)); + ret = -1; + goto out; + } + + tmp1 = dict_copy (tmp2, tmp1); + if (tmp1 == NULL) { + gf_log (this->name, GF_LOG_WARNING, + "dict_copy failed (%s)", + strerror (-ret)); + ret = -1; + goto out; + } + } + } + + len = dict_serialized_length (tmp1); + if (len > 0) { + buf = GF_CALLOC (1, len, gf_common_mt_dict_t); + if (buf == NULL) { + ret = -1; + goto out; + } + + ret = dict_serialize (tmp1, buf); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "dict_serialize failed (%s)", strerror (-ret)); + GF_FREE(buf); + ret = -1; + goto out; + } + + *xattr_serz = buf; + } + + ret = 0; +out: + if (tmp1 != NULL) { + dict_unref (tmp1); + } + + if (tmp2 != NULL) { + dict_unref (tmp2); + } + + return ret; +} + + +int32_t +stripe_fill_pathinfo_xattr (xlator_t *this, stripe_local_t *local, + char **xattr_serz) +{ + int ret = -1; + int32_t padding = 0; + int32_t tlen = 0; + char stripe_size_str[20] = {0,}; + char *pathinfo_serz = NULL; + + if (!local) { + gf_log (this->name, GF_LOG_ERROR, "Possible NULL deref"); + goto out; + } + + (void) snprintf (stripe_size_str, 20, "%"PRId64, + (long long) (local->fctx) ? local->fctx->stripe_size : 0); + + /* extra bytes for decorations (brackets and <>'s) */ + padding = strlen (this->name) + strlen (STRIPE_PATHINFO_HEADER) + + strlen (stripe_size_str) + 7; + local->xattr_total_len += (padding + 2); + + pathinfo_serz = GF_CALLOC (local->xattr_total_len, sizeof (char), + gf_common_mt_char); + if (!pathinfo_serz) + goto out; + + /* xlator info */ + (void) sprintf (pathinfo_serz, "(<"STRIPE_PATHINFO_HEADER"%s:[%s]> ", + this->name, stripe_size_str); + + ret = stripe_xattr_aggregate (pathinfo_serz + padding, local, &tlen); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Cannot aggregate pathinfo list"); + GF_FREE(pathinfo_serz); + goto out; + } + + *(pathinfo_serz + padding + tlen) = ')'; + *(pathinfo_serz + padding + tlen + 1) = '\0'; + + *xattr_serz = pathinfo_serz; + + ret = 0; + out: + return ret; +} + +/** + * stripe_get_matching_bs - Get the matching block size for the given path. + */ +int32_t +stripe_get_matching_bs (const char *path, stripe_private_t *priv) +{ + struct stripe_options *trav = NULL; + uint64_t block_size = 0; + + GF_VALIDATE_OR_GOTO ("stripe", priv, out); + GF_VALIDATE_OR_GOTO ("stripe", path, out); + + LOCK (&priv->lock); + { + block_size = priv->block_size; + trav = priv->pattern; + while (trav) { + if (!fnmatch (trav->path_pattern, path, FNM_NOESCAPE)) { + block_size = trav->block_size; + break; + } + trav = trav->next; + } + } + UNLOCK (&priv->lock); + +out: + return block_size; +} + +int32_t +stripe_ctx_handle (xlator_t *this, call_frame_t *prev, stripe_local_t *local, + dict_t *dict) +{ + char key[256] = {0,}; + data_t *data = NULL; + int32_t index = 0; + stripe_private_t *priv = NULL; + + priv = this->private; + + + if (!local->fctx) { + local->fctx = GF_CALLOC (1, sizeof (stripe_fd_ctx_t), + gf_stripe_mt_stripe_fd_ctx_t); + if (!local->fctx) { + local->op_errno = ENOMEM; + local->op_ret = -1; + goto out; + } + + local->fctx->static_array = 0; + } + /* Stripe block size */ + sprintf (key, "trusted.%s.stripe-size", this->name); + data = dict_get (dict, key); + if (!data) { + local->xattr_self_heal_needed = 1; + gf_log (this->name, GF_LOG_ERROR, + "Failed to get stripe-size"); + goto out; + } else { + if (!local->fctx->stripe_size) { + local->fctx->stripe_size = + data_to_int64 (data); + } + + if (local->fctx->stripe_size != data_to_int64 (data)) { + gf_log (this->name, GF_LOG_WARNING, + "stripe-size mismatch in blocks"); + local->xattr_self_heal_needed = 1; + } + } + + /* Stripe count */ + sprintf (key, "trusted.%s.stripe-count", this->name); + data = dict_get (dict, key); + + if (!data) { + local->xattr_self_heal_needed = 1; + gf_log (this->name, GF_LOG_ERROR, + "Failed to get stripe-count"); + goto out; + } + if (!local->fctx->xl_array) { + local->fctx->stripe_count = data_to_int32 (data); + if (!local->fctx->stripe_count) { + gf_log (this->name, GF_LOG_ERROR, + "error with stripe-count xattr"); + local->op_ret = -1; + local->op_errno = EIO; + goto out; + } + + local->fctx->xl_array = GF_CALLOC (local->fctx->stripe_count, + sizeof (xlator_t *), + gf_stripe_mt_xlator_t); + + if (!local->fctx->xl_array) { + local->op_errno = ENOMEM; + local->op_ret = -1; + goto out; + } + } + if (local->fctx->stripe_count != data_to_int32 (data)) { + gf_log (this->name, GF_LOG_ERROR, + "error with stripe-count xattr (%d != %d)", + local->fctx->stripe_count, data_to_int32 (data)); + local->op_ret = -1; + local->op_errno = EIO; + goto out; + } + + /* index */ + sprintf (key, "trusted.%s.stripe-index", this->name); + data = dict_get (dict, key); + if (!data) { + local->xattr_self_heal_needed = 1; + gf_log (this->name, GF_LOG_ERROR, + "Failed to get stripe-index"); + goto out; + } + index = data_to_int32 (data); + if (index > priv->child_count) { + gf_log (this->name, GF_LOG_ERROR, + "error with stripe-index xattr (%d)", index); + local->op_ret = -1; + local->op_errno = EIO; + goto out; + } + if (local->fctx->xl_array) { + if (!local->fctx->xl_array[index]) + local->fctx->xl_array[index] = prev->this; + } + + sprintf(key, "trusted.%s.stripe-coalesce", this->name); + data = dict_get(dict, key); + if (!data) { + /* + * The file was probably created prior to coalesce support. + * Assume non-coalesce mode for this file to maintain backwards + * compatibility. + */ + gf_log(this->name, GF_LOG_DEBUG, "missing stripe-coalesce " + "attr, assume non-coalesce mode"); + local->fctx->stripe_coalesce = 0; + } else { + local->fctx->stripe_coalesce = data_to_int32(data); + } + + +out: + return 0; +} + +int32_t +stripe_xattr_request_build (xlator_t *this, dict_t *dict, uint64_t stripe_size, + uint32_t stripe_count, uint32_t stripe_index, + uint32_t stripe_coalesce) +{ + char key[256] = {0,}; + int32_t ret = -1; + + sprintf (key, "trusted.%s.stripe-size", this->name); + ret = dict_set_int64 (dict, key, stripe_size); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set %s in xattr_req dict", key); + goto out; + } + + sprintf (key, "trusted.%s.stripe-count", this->name); + ret = dict_set_int32 (dict, key, stripe_count); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set %s in xattr_req dict", key); + goto out; + } + + sprintf (key, "trusted.%s.stripe-index", this->name); + ret = dict_set_int32 (dict, key, stripe_index); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set %s in xattr_req dict", key); + goto out; + } + + sprintf(key, "trusted.%s.stripe-coalesce", this->name); + ret = dict_set_int32(dict, key, stripe_coalesce); + if (ret) { + gf_log(this->name, GF_LOG_WARNING, + "failed to set %s in xattr_req_dict", key); + goto out; + } +out: + return ret; +} + + +static int +set_default_block_size (stripe_private_t *priv, char *num) +{ + + int ret = -1; + GF_VALIDATE_OR_GOTO ("stripe", THIS, out); + GF_VALIDATE_OR_GOTO (THIS->name, priv, out); + GF_VALIDATE_OR_GOTO (THIS->name, num, out); + + + if (gf_string2bytesize_uint64 (num, &priv->block_size) != 0) { + gf_log (THIS->name, GF_LOG_ERROR, + "invalid number format \"%s\"", num); + goto out; + } + + ret = 0; + + out: + return ret; + +} + + +int +set_stripe_block_size (xlator_t *this, stripe_private_t *priv, char *data) +{ + int ret = -1; + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *dup_str = NULL; + char *stripe_str = NULL; + char *pattern = NULL; + char *num = NULL; + struct stripe_options *temp_stripeopt = NULL; + struct stripe_options *stripe_opt = NULL; + + if (!this || !priv || !data) + goto out; + + /* Get the pattern for striping. + "option block-size *avi:10MB" etc */ + stripe_str = strtok_r (data, ",", &tmp_str); + while (stripe_str) { + dup_str = gf_strdup (stripe_str); + stripe_opt = GF_CALLOC (1, sizeof (struct stripe_options), + gf_stripe_mt_stripe_options); + if (!stripe_opt) { + goto out; + } + + pattern = strtok_r (dup_str, ":", &tmp_str1); + num = strtok_r (NULL, ":", &tmp_str1); + if (!num) { + num = pattern; + pattern = "*"; + ret = set_default_block_size (priv, num); + if (ret) + goto out; + } + if (gf_string2bytesize_uint64 (num, &stripe_opt->block_size) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "invalid number format \"%s\"", num); + goto out; + } + + if (stripe_opt->block_size < STRIPE_MIN_BLOCK_SIZE) { + gf_log (this->name, GF_LOG_ERROR, "Invalid Block-size: " + "%s. Should be atleast %llu bytes", num, + STRIPE_MIN_BLOCK_SIZE); + goto out; + } + if (stripe_opt->block_size % 512) { + gf_log (this->name, GF_LOG_ERROR, "Block-size: %s should" + " be a multiple of 512 bytes", num); + goto out; + } + + memcpy (stripe_opt->path_pattern, pattern, strlen (pattern)); + + gf_log (this->name, GF_LOG_DEBUG, + "block-size : pattern %s : size %"PRId64, + stripe_opt->path_pattern, stripe_opt->block_size); + + if (priv->pattern) + temp_stripeopt = NULL; + else + temp_stripeopt = priv->pattern; + + stripe_opt->next = temp_stripeopt; + + priv->pattern = stripe_opt; + stripe_opt = NULL; + + GF_FREE (dup_str); + dup_str = NULL; + + stripe_str = strtok_r (NULL, ",", &tmp_str); + } + + ret = 0; +out: + + GF_FREE (dup_str); + + GF_FREE (stripe_opt); + + return ret; +} + +int32_t +stripe_iatt_merge (struct iatt *from, struct iatt *to) +{ + if (to->ia_size < from->ia_size) + to->ia_size = from->ia_size; + if (to->ia_mtime < from->ia_mtime) + to->ia_mtime = from->ia_mtime; + if (to->ia_ctime < from->ia_ctime) + to->ia_ctime = from->ia_ctime; + if (to->ia_atime < from->ia_atime) + to->ia_atime = from->ia_atime; + return 0; +} + +off_t +coalesced_offset(off_t offset, uint64_t stripe_size, int stripe_count) +{ + size_t line_size = 0; + uint64_t stripe_num = 0; + off_t coalesced_offset = 0; + + line_size = stripe_size * stripe_count; + stripe_num = offset / line_size; + + coalesced_offset = (stripe_num * stripe_size) + + (offset % stripe_size); + + return coalesced_offset; +} + +off_t +uncoalesced_size(off_t size, uint64_t stripe_size, int stripe_count, + int stripe_index) +{ + uint64_t nr_full_stripe_chunks = 0, mod = 0; + + if (!size) + return size; + + /* + * Estimate the number of fully written stripes from the + * local file size. Each stripe_size chunk corresponds to + * a stripe. + */ + nr_full_stripe_chunks = (size / stripe_size) * stripe_count; + mod = size % stripe_size; + + if (!mod) { + /* + * There is no remainder, thus we could have overestimated + * the size of the file in terms of chunks. Trim the number + * of chunks by the following stripe members and leave it + * up to those nodes to respond with a larger size (if + * necessary). + */ + nr_full_stripe_chunks -= stripe_count - + (stripe_index + 1); + size = nr_full_stripe_chunks * stripe_size; + } else { + /* + * There is a remainder and thus we own the last chunk of the + * file. Add the preceding stripe members of the final stripe + * along with the remainder to calculate the exact size. + */ + nr_full_stripe_chunks += stripe_index; + size = nr_full_stripe_chunks * stripe_size + mod; + } + + return size; +} diff --git a/xlators/cluster/stripe/src/stripe-mem-types.h b/xlators/cluster/stripe/src/stripe-mem-types.h new file mode 100644 index 000000000..e9ac9cf46 --- /dev/null +++ b/xlators/cluster/stripe/src/stripe-mem-types.h @@ -0,0 +1,31 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef __STRIPE_MEM_TYPES_H__ +#define __STRIPE_MEM_TYPES_H__ + +#include "mem-types.h" + +enum gf_stripe_mem_types_ { + gf_stripe_mt_iovec = gf_common_mt_end + 1, + gf_stripe_mt_stripe_replies, + gf_stripe_mt_stripe_fd_ctx_t, + gf_stripe_mt_char, + gf_stripe_mt_int8_t, + gf_stripe_mt_int32_t, + gf_stripe_mt_xlator_t, + gf_stripe_mt_stripe_private_t, + gf_stripe_mt_stripe_options, + gf_stripe_mt_xattr_sort_t, + gf_stripe_mt_end +}; +#endif + diff --git a/xlators/cluster/stripe/src/stripe.c b/xlators/cluster/stripe/src/stripe.c index 64c81a539..0ebea8168 100644 --- a/xlators/cluster/stripe/src/stripe.c +++ b/xlators/cluster/stripe/src/stripe.c @@ -1,305 +1,182 @@ /* - Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ /** * xlators/cluster/stripe: - * Stripe translator, stripes the data accross its child nodes, - * as per the options given in the volfile. The striping works - * fairly simple. It writes files at different offset as per - * calculation. So, 'ls -l' output at the real posix level will - * show file size bigger than the actual size. But when one does + * Stripe translator, stripes the data across its child nodes, + * as per the options given in the volfile. The striping works + * fairly simple. It writes files at different offset as per + * calculation. So, 'ls -l' output at the real posix level will + * show file size bigger than the actual size. But when one does * 'df' or 'du <file>', real size of the file on the server is shown. * * WARNING: * Stripe translator can't regenerate data if a child node gets disconnected. - * So, no 'self-heal' for stripe. Hence the advice, use stripe only when its - * very much necessary, or else, use it in combination with AFR, to have a - * backup copy. - */ - -/* TODO: - * 1. Implement basic self-heal ability to manage the basic backend - * layout missmatch. - * + * So, no 'self-heal' for stripe. Hence the advice, use stripe only when its + * very much necessary, or else, use it in combination with AFR, to have a + * backup copy. */ +#include <fnmatch.h> #include "stripe.h" +#include "libxlator.h" +#include "byte-order.h" +#include "statedump.h" -/** - * stripe_get_matching_bs - Get the matching block size for the given path. - */ -int32_t -stripe_get_matching_bs (const char *path, struct stripe_options *opts, - uint64_t default_bs) -{ - struct stripe_options *trav = NULL; - char *pathname = NULL; - uint64_t block_size = 0; - - block_size = default_bs; - pathname = strdup (path); - trav = opts; - - while (trav) { - if (!fnmatch (trav->path_pattern, pathname, FNM_NOESCAPE)) { - block_size = trav->block_size; - break; - } - trav = trav->next; - } - free (pathname); - - return block_size; -} - +struct volume_options options[]; -/* - * stripe_common_cbk - - */ int32_t -stripe_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -/** - * stripe_stack_unwind_cbk - This function is used for all the _cbk without - * any extra arguments (other than the minimum given) - * This is called from functions like fsync,unlink,rmdir etc. - * - */ -int32_t -stripe_stack_unwind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +stripe_sh_chown_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, dict_t *xdata) { - int32_t callcnt = 0; + int callcnt = -1; stripe_local_t *local = NULL; + if (!this || !frame || !frame->local) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + local = frame->local; LOCK (&frame->lock); { callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned %s", - ((call_frame_t *)cookie)->this->name, - strerror (op_errno)); - local->op_errno = op_errno; - if (op_errno == ENOTCONN) - local->failed = 1; - } - if (op_ret >= 0) - local->op_ret = op_ret; } UNLOCK (&frame->lock); if (!callcnt) { - if (local->failed) - local->op_ret = -1; - - if (local->loc.path) - loc_wipe (&local->loc); - if (local->loc2.path) - loc_wipe (&local->loc2); - - STACK_UNWIND (frame, local->op_ret, local->op_errno); + STRIPE_STACK_DESTROY (frame); } +out: return 0; } -int32_t -stripe_common_buf_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) -{ - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; -} - -/** - * stripe_stack_unwind_buf_cbk - This function is used for all the _cbk with - * 'struct stat *buf' as extra argument (other than minimum) - * This is called from functions like, chmod, fchmod, chown, fchown, - * truncate, ftruncate, utimens etc. - * - * @cookie - this argument should be always 'xlator_t *' of child node - */ -int32_t -stripe_stack_unwind_buf_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct stat *buf) +int32_t +stripe_sh_make_entry_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - int32_t callcnt = 0; stripe_local_t *local = NULL; + call_frame_t *prev = NULL; - local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - ((call_frame_t *)cookie)->this->name, - strerror (op_errno)); - local->op_errno = op_errno; - if (op_errno == ENOTCONN) - local->failed = 1; - } - - if (op_ret == 0) { - local->op_ret = 0; - if (local->stbuf.st_blksize == 0) { - local->stbuf = *buf; - /* Because st_blocks gets added again */ - local->stbuf.st_blocks = 0; - } - - if (FIRST_CHILD(this) == - ((call_frame_t *)cookie)->this) { - /* Always, pass the inode number of - first child to the above layer */ - local->stbuf.st_ino = buf->st_ino; - local->stbuf.st_mtime = buf->st_mtime; - } - - local->stbuf.st_blocks += buf->st_blocks; - if (local->stbuf.st_size < buf->st_size) - local->stbuf.st_size = buf->st_size; - if (local->stbuf.st_blksize != buf->st_blksize) { - /* TODO: add to blocks in terms of - original block size */ - } - } + if (!frame || !frame->local || !cookie || !this) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->failed) - local->op_ret = -1; - if (local->loc.path) - loc_wipe (&local->loc); - if (local->loc2.path) - loc_wipe (&local->loc2); + prev = cookie; + local = frame->local; - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->stbuf); - } + STACK_WIND (frame, stripe_sh_chown_cbk, prev->this, + prev->this->fops->setattr, &local->loc, + &local->stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID), NULL); +out: return 0; } -/* In case of symlink, mknod, the file is created on just first node */ -int32_t -stripe_common_inode_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct stat *buf) +int32_t +stripe_entry_self_heal (call_frame_t *frame, xlator_t *this, + stripe_local_t *local) { - STACK_UNWIND (frame, op_ret, op_errno, inode, buf); - return 0; -} + xlator_list_t *trav = NULL; + call_frame_t *rframe = NULL; + stripe_local_t *rlocal = NULL; + stripe_private_t *priv = NULL; + dict_t *xdata = NULL; + int ret = 0; -/** - * stripe_stack_unwind_inode_cbk - This is called by the function like, - * link (), symlink (), mkdir (), mknod () - * This creates a inode for new inode. It keeps a list of all - * the inodes received from the child nodes. It is used while - * forwarding any fops to child nodes. - * - */ -int32_t -stripe_stack_unwind_inode_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, inode_t *inode, - struct stat *buf) -{ - int32_t callcnt = 0; - stripe_local_t *local = NULL; + if (!local || !this || !frame) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } - local = frame->local; + if (!(IA_ISREG (local->stbuf.ia_type) || + IA_ISDIR (local->stbuf.ia_type))) + return 0; - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - ((call_frame_t *)cookie)->this->name, - strerror (op_errno)); - local->op_errno = op_errno; - if (op_errno == ENOTCONN) - local->failed = 1; - } - - if (op_ret >= 0) { - local->op_ret = 0; + priv = this->private; + trav = this->children; + rframe = copy_frame (frame); + if (!rframe) { + goto out; + } + rlocal = mem_get0 (this->local_pool); + if (!rlocal) { + goto out; + } + rframe->local = rlocal; + rlocal->call_count = priv->child_count; + loc_copy (&rlocal->loc, &local->loc); + memcpy (&rlocal->stbuf, &local->stbuf, sizeof (struct iatt)); - if (local->stbuf.st_blksize == 0) { - local->inode = inode; - local->stbuf = *buf; - /* Because st_blocks gets added again */ - local->stbuf.st_blocks = 0; - } - if (FIRST_CHILD(this) == - ((call_frame_t *)cookie)->this) { - local->stbuf.st_ino = buf->st_ino; - local->stbuf.st_mtime = buf->st_mtime; - } + xdata = dict_new (); + if (!xdata) + goto out; - local->stbuf.st_blocks += buf->st_blocks; - if (local->stbuf.st_size < buf->st_size) - local->stbuf.st_size = buf->st_size; - if (local->stbuf.st_blksize != buf->st_blksize) { - /* TODO: add to blocks in terms of - original block size */ - } + ret = dict_set_static_bin (xdata, "gfid-req", local->stbuf.ia_gfid, 16); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set gfid-req", local->loc.path); + + while (trav) { + if (IA_ISREG (local->stbuf.ia_type)) { + STACK_WIND (rframe, stripe_sh_make_entry_cbk, + trav->xlator, trav->xlator->fops->mknod, + &local->loc, + st_mode_from_ia (local->stbuf.ia_prot, + local->stbuf.ia_type), + 0, 0, xdata); } + if (IA_ISDIR (local->stbuf.ia_type)) { + STACK_WIND (rframe, stripe_sh_make_entry_cbk, + trav->xlator, trav->xlator->fops->mkdir, + &local->loc, + st_mode_from_ia (local->stbuf.ia_prot, + local->stbuf.ia_type), + 0, xdata); + } + trav = trav->next; } - UNLOCK (&frame->lock); - if (!callcnt) { - if (local->failed) - local->op_ret = -1; + if (xdata) + dict_unref (xdata); + return 0; - STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->inode, &local->stbuf); - } +out: + if (rframe) + STRIPE_STACK_DESTROY (rframe); + if (xdata) + dict_unref (xdata); return 0; } -int32_t -stripe_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + +int32_t +stripe_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, - struct stat *buf, dict_t *dict) + struct iatt *buf, dict_t *xdata, struct iatt *postparent) { - int32_t callcnt = 0; - dict_t *tmp_dict = NULL; - inode_t *tmp_inode = NULL; - stripe_local_t *local = NULL; - call_frame_t *prev = NULL; + int32_t callcnt = 0; + stripe_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = 0; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } prev = cookie; local = frame->local; @@ -307,89 +184,115 @@ stripe_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, LOCK (&frame->lock); { callcnt = --local->call_count; - + if (op_ret == -1) { if (op_errno != ENOENT) - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_DEBUG, "%s returned error %s", prev->this->name, strerror (op_errno)); if (local->op_errno != ESTALE) local->op_errno = op_errno; - if ((op_errno == ENOTCONN) || (op_errno == ESTALE)) + if (((op_errno != ENOENT) && (op_errno != ENOTCONN)) || + (prev->this == FIRST_CHILD (this))) local->failed = 1; - /* TODO: bring in self-heal ability */ - /* - * if (local->op_ret == 0) { - * if (S_ISREG (local->stbuf.st_mode) || - * S_ISDIR (local->stbuf.st_mode)) - * local->entry_self_heal_needed = 1; - * } - */ - } - + if (op_errno == ENOENT) + local->entry_self_heal_needed = 1; + } + if (op_ret >= 0) { local->op_ret = 0; + if (IA_ISREG (buf->ia_type)) { + ret = stripe_ctx_handle (this, prev, local, + xdata); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Error getting fctx info from" + " dict"); + } - if (local->stbuf.st_blksize == 0) { + if (FIRST_CHILD(this) == prev->this) { + local->stbuf = *buf; + local->postparent = *postparent; local->inode = inode_ref (inode); - local->stbuf = *buf; - /* Because st_blocks gets added again */ - local->stbuf.st_blocks = 0; + if (xdata) + local->xdata = dict_ref (xdata); + if (local->xattr) { + stripe_aggregate_xattr (local->xdata, + local->xattr); + dict_unref (local->xattr); + local->xattr = NULL; + } } - if (FIRST_CHILD(this) == prev->this) { - local->stbuf.st_ino = buf->st_ino; - local->stbuf.st_mtime = buf->st_mtime; - if (local->dict) - dict_unref (local->dict); - local->dict = dict_ref (dict); - } else { - if (!local->dict) - local->dict = dict_ref (dict); + + if (!local->xdata && !local->xattr) { + local->xattr = dict_ref (xdata); + } else if (local->xdata) { + stripe_aggregate_xattr (local->xdata, xdata); + } else if (local->xattr) { + stripe_aggregate_xattr (local->xattr, xdata); } - local->stbuf.st_blocks += buf->st_blocks; - if (local->stbuf.st_size < buf->st_size) - local->stbuf.st_size = buf->st_size; - if (local->stbuf.st_blksize != buf->st_blksize) { - /* TODO: add to blocks in terms of - original block size */ + + local->stbuf_blocks += buf->ia_blocks; + local->postparent_blocks += postparent->ia_blocks; + + correct_file_size(buf, local->fctx, prev); + + if (local->stbuf_size < buf->ia_size) + local->stbuf_size = buf->ia_size; + if (local->postparent_size < postparent->ia_size) + local->postparent_size = postparent->ia_size; + + if (uuid_is_null (local->ia_gfid)) + uuid_copy (local->ia_gfid, buf->ia_gfid); + + /* Make sure the gfid on all the nodes are same */ + if (uuid_compare (local->ia_gfid, buf->ia_gfid)) { + gf_log (this->name, GF_LOG_WARNING, + "%s: gfid different on subvolume %s", + local->loc.path, prev->this->name); } } } UNLOCK (&frame->lock); if (!callcnt) { + if (local->op_ret == 0 && local->entry_self_heal_needed && + !uuid_is_null (local->loc.inode->gfid)) + stripe_entry_self_heal (frame, this, local); + if (local->failed) local->op_ret = -1; - tmp_dict = local->dict; - tmp_inode = local->inode; - - STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->inode, &local->stbuf, local->dict); + if (local->op_ret != -1) { + local->stbuf.ia_blocks = local->stbuf_blocks; + local->stbuf.ia_size = local->stbuf_size; + local->postparent.ia_blocks = local->postparent_blocks; + local->postparent.ia_size = local->postparent_size; + inode_ctx_put (local->inode, this, + (uint64_t) (long)local->fctx); + } - if (tmp_inode) - inode_unref (tmp_inode); - if (tmp_dict) - dict_unref (tmp_dict); + STRIPE_STACK_UNWIND (lookup, frame, local->op_ret, + local->op_errno, local->inode, + &local->stbuf, local->xdata, + &local->postparent); } - +out: return 0; } - -/** - * stripe_lookup - - */ -int32_t -stripe_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *xattr_req) +int32_t +stripe_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xdata) { - stripe_local_t *local = NULL; - xlator_list_t *trav = NULL; - stripe_private_t *priv = NULL; - char send_lookup_to_all = 0; - int32_t op_errno = 1; + stripe_local_t *local = NULL; + xlator_list_t *trav = NULL; + stripe_private_t *priv = NULL; + int32_t op_errno = EINVAL; + int64_t filesize = 0; + int ret = 0; + uint64_t tmpctx = 0; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -401,172 +304,136 @@ stripe_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, trav = this->children; /* Initialization */ - local = CALLOC (1, sizeof (stripe_local_t)); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } local->op_ret = -1; frame->local = local; + loc_copy (&local->loc, loc); - if ((!loc->inode->st_mode) || S_ISDIR (loc->inode->st_mode) || - S_ISREG (loc->inode->st_mode)) { - send_lookup_to_all = 1; - } + inode_ctx_get (local->inode, this, &tmpctx); + if (tmpctx) + local->fctx = (stripe_fd_ctx_t*) (long)tmpctx; - if (send_lookup_to_all) { - /* Everytime in stripe lookup, all child nodes - should be looked up */ - local->call_count = priv->child_count; - while (trav) { - STACK_WIND (frame, stripe_lookup_cbk, trav->xlator, - trav->xlator->fops->lookup, - loc, xattr_req); - trav = trav->next; - } - } else { - local->call_count = 1; - - STACK_WIND (frame, stripe_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, - loc, xattr_req); + /* quick-read friendly changes */ + if (xdata && dict_get (xdata, GF_CONTENT_KEY)) { + ret = dict_get_int64 (xdata, GF_CONTENT_KEY, &filesize); + if (!ret && (filesize > priv->block_size)) + dict_del (xdata, GF_CONTENT_KEY); } - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - return 0; -} - -/** - * stripe_stat - - */ -int32_t -stripe_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) -{ - int send_lookup_to_all = 0; - xlator_list_t *trav = NULL; - stripe_local_t *local = NULL; - stripe_private_t *priv = NULL; - int32_t op_errno = 1; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (loc->inode, err); - priv = this->private; - trav = this->children; + /* get stripe-size xattr on lookup. This would be required for + * open/read/write/pathinfo calls. Hence we send down the request + * even when type == IA_INVAL */ + + /* + * We aren't guaranteed to have xdata here. We need the format info for + * the file, so allocate xdata if necessary. + */ + if (!xdata) + xdata = dict_new(); + else + xdata = dict_ref(xdata); + + if (xdata && (IA_ISREG (loc->inode->ia_type) || + (loc->inode->ia_type == IA_INVAL))) { + ret = stripe_xattr_request_build (this, xdata, 8, 4, 4, 0); + if (ret) + gf_log (this->name , GF_LOG_ERROR, "Failed to build" + " xattr request for %s", loc->path); - if (priv->first_child_down) { - op_errno = ENOTCONN; - goto err; } - if (S_ISDIR (loc->inode->st_mode) || S_ISREG (loc->inode->st_mode)) - send_lookup_to_all = 1; - - if (!send_lookup_to_all) { - STACK_WIND (frame, stripe_common_buf_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->stat, loc); - } else { - /* Initialization */ - local = CALLOC (1, sizeof (stripe_local_t)); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - frame->local = local; - local->inode = loc->inode; - local->call_count = priv->child_count; - - while (trav) { - STACK_WIND (frame, stripe_stack_unwind_buf_cbk, - trav->xlator, trav->xlator->fops->stat, - loc); - trav = trav->next; - } + /* Everytime in stripe lookup, all child nodes + should be looked up */ + local->call_count = priv->child_count; + while (trav) { + STACK_WIND (frame, stripe_lookup_cbk, trav->xlator, + trav->xlator->fops->lookup, loc, xdata); + trav = trav->next; } + dict_unref(xdata); + return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL); +err: + STRIPE_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); return 0; } -/** - * stripe_chmod - - */ int32_t -stripe_chmod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode) +stripe_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) { - int send_fop_to_all = 0; - xlator_list_t *trav = NULL; - stripe_local_t *local = NULL; - stripe_private_t *priv = NULL; - int32_t op_errno = 1; + int32_t callcnt = 0; + stripe_local_t *local = NULL; + call_frame_t *prev = NULL; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (loc->inode, err); + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + prev = cookie; + local = frame->local; - priv = this->private; - trav = this->children; + LOCK (&frame->lock); + { + callcnt = --local->call_count; - if (priv->first_child_down) { - op_errno = ENOTCONN; - goto err; - } + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "%s returned error %s", + prev->this->name, strerror (op_errno)); + local->op_errno = op_errno; + if ((op_errno != ENOENT) || + (prev->this == FIRST_CHILD (this))) + local->failed = 1; + } - if (S_ISDIR (loc->inode->st_mode) || S_ISREG (loc->inode->st_mode)) - send_fop_to_all = 1; + if (op_ret == 0) { + local->op_ret = 0; - if (!send_fop_to_all) { - STACK_WIND (frame, stripe_common_buf_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->chmod, loc, mode); - } else { - /* Initialization */ - local = CALLOC (1, sizeof (stripe_local_t)); - if (!local) { - op_errno = ENOMEM; - goto err; + if (FIRST_CHILD(this) == prev->this) { + local->stbuf = *buf; + } + + local->stbuf_blocks += buf->ia_blocks; + + correct_file_size(buf, local->fctx, prev); + + if (local->stbuf_size < buf->ia_size) + local->stbuf_size = buf->ia_size; } - local->op_ret = -1; - frame->local = local; - local->inode = loc->inode; - local->call_count = priv->child_count; + } + UNLOCK (&frame->lock); - while (trav) { - STACK_WIND (frame, stripe_stack_unwind_buf_cbk, - trav->xlator, trav->xlator->fops->chmod, - loc, mode); - trav = trav->next; + if (!callcnt) { + if (local->failed) + local->op_ret = -1; + + if (local->op_ret != -1) { + local->stbuf.ia_size = local->stbuf_size; + local->stbuf.ia_blocks = local->stbuf_blocks; } + + STRIPE_STACK_UNWIND (stat, frame, local->op_ret, + local->op_errno, &local->stbuf, NULL); } - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL); +out: return 0; } - -/** - * stripe_chown - - */ int32_t -stripe_chown (call_frame_t *frame, xlator_t *this, loc_t *loc, uid_t uid, - gid_t gid) +stripe_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - int send_fop_to_all = 0; xlator_list_t *trav = NULL; stripe_local_t *local = NULL; stripe_private_t *priv = NULL; - int32_t op_errno = 1; + stripe_fd_ctx_t *fctx = NULL; + int32_t op_errno = EINVAL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -582,49 +449,48 @@ stripe_chown (call_frame_t *frame, xlator_t *this, loc_t *loc, uid_t uid, goto err; } - if (S_ISDIR (loc->inode->st_mode) || S_ISREG (loc->inode->st_mode)) - send_fop_to_all = 1; + /* Initialization */ + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + local->op_ret = -1; + frame->local = local; + local->call_count = priv->child_count; - if (!send_fop_to_all) { - STACK_WIND (frame, stripe_common_buf_cbk, trav->xlator, - trav->xlator->fops->chown, loc, uid, gid); - } else { - /* Initialization */ - local = CALLOC (1, sizeof (stripe_local_t)); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - frame->local = local; - local->inode = loc->inode; - local->call_count = priv->child_count; + if (IA_ISREG(loc->inode->ia_type)) { + inode_ctx_get(loc->inode, this, (uint64_t *) &fctx); + if (!fctx) + goto err; + local->fctx = fctx; + } - while (trav) { - STACK_WIND (frame, stripe_stack_unwind_buf_cbk, - trav->xlator, trav->xlator->fops->chown, - loc, uid, gid); - trav = trav->next; - } + while (trav) { + STACK_WIND (frame, stripe_stat_cbk, trav->xlator, + trav->xlator->fops->stat, loc, NULL); + trav = trav->next; } return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL); + +err: + STRIPE_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); return 0; } -/** - * stripe_statfs_cbk - - */ int32_t stripe_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct statvfs *stbuf) + int32_t op_ret, int32_t op_errno, struct statvfs *stbuf, dict_t *xdata) { stripe_local_t *local = NULL; int32_t callcnt = 0; + if (!this || !frame || !frame->local) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } local = frame->local; LOCK(&frame->lock); @@ -651,32 +517,32 @@ stripe_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } } UNLOCK (&frame->lock); - + if (!callcnt) { - STACK_UNWIND (frame, local->op_ret, - local->op_errno, &local->statvfs_buf); + STRIPE_STACK_UNWIND (statfs, frame, local->op_ret, + local->op_errno, &local->statvfs_buf, NULL); } - +out: return 0; } - -/** - * stripe_statfs - - */ int32_t -stripe_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) +stripe_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { stripe_local_t *local = NULL; xlator_list_t *trav = NULL; stripe_private_t *priv = NULL; - int32_t op_errno = 1; + int32_t op_errno = EINVAL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); trav = this->children; priv = this->private; /* Initialization */ - local = CALLOC (1, sizeof (stripe_local_t)); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -688,28 +554,99 @@ stripe_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) local->call_count = priv->child_count; while (trav) { STACK_WIND (frame, stripe_statfs_cbk, trav->xlator, - trav->xlator->fops->statfs, loc); + trav->xlator->fops->statfs, loc, NULL); trav = trav->next; } return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL); +err: + STRIPE_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); return 0; } -/** - * stripe_truncate - - */ + int32_t -stripe_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) +stripe_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t callcnt = 0; + stripe_local_t *local = NULL; + call_frame_t *prev = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "%s returned error %s", + prev->this->name, strerror (op_errno)); + local->op_errno = op_errno; + if ((op_errno != ENOENT) || + (prev->this == FIRST_CHILD (this))) + local->failed = 1; + } + + if (op_ret == 0) { + local->op_ret = 0; + if (FIRST_CHILD(this) == prev->this) { + local->pre_buf = *prebuf; + local->post_buf = *postbuf; + } + + local->prebuf_blocks += prebuf->ia_blocks; + local->postbuf_blocks += postbuf->ia_blocks; + + correct_file_size(prebuf, local->fctx, prev); + correct_file_size(postbuf, local->fctx, prev); + + if (local->prebuf_size < prebuf->ia_size) + local->prebuf_size = prebuf->ia_size; + + if (local->postbuf_size < postbuf->ia_size) + local->postbuf_size = postbuf->ia_size; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->failed) + local->op_ret = -1; + + if (local->op_ret != -1) { + local->pre_buf.ia_blocks = local->prebuf_blocks; + local->pre_buf.ia_size = local->prebuf_size; + local->post_buf.ia_blocks = local->postbuf_blocks; + local->post_buf.ia_size = local->postbuf_size; + } + + STRIPE_STACK_UNWIND (truncate, frame, local->op_ret, + local->op_errno, &local->pre_buf, + &local->post_buf, NULL); + } +out: + return 0; +} + +int32_t +stripe_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, dict_t *xdata) { - int send_fop_to_all = 0; - xlator_list_t *trav = NULL; stripe_local_t *local = NULL; stripe_private_t *priv = NULL; - int32_t op_errno = 1; + stripe_fd_ctx_t *fctx = NULL; + int32_t op_errno = EINVAL; + int i, eof_idx; + off_t dest_offset, tmp_offset; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -718,58 +655,157 @@ stripe_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) VALIDATE_OR_GOTO (loc->inode, err); priv = this->private; - trav = this->children; if (priv->first_child_down) { op_errno = ENOTCONN; goto err; } - if (S_ISDIR (loc->inode->st_mode) || S_ISREG (loc->inode->st_mode)) - send_fop_to_all = 1; + /* Initialization */ + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + local->op_ret = -1; + frame->local = local; + local->call_count = priv->child_count; + + inode_ctx_get(loc->inode, this, (uint64_t *) &fctx); + if (!fctx) { + gf_log(this->name, GF_LOG_ERROR, "no stripe context"); + op_errno = EINVAL; + goto err; + } + + local->fctx = fctx; + eof_idx = (offset / fctx->stripe_size) % fctx->stripe_count; + + for (i = 0; i < fctx->stripe_count; i++) { + if (!fctx->xl_array[i]) { + gf_log(this->name, GF_LOG_ERROR, + "no xlator at index %d", i); + op_errno = EINVAL; + goto err; + } + + if (fctx->stripe_coalesce) { + /* + * The node that owns EOF is truncated to the exact + * coalesced offset. Nodes prior to this index should + * be rounded up to the size of the complete stripe, + * while nodes after this index should be rounded down + * to the size of the previous stripe. + */ + if (i < eof_idx) + tmp_offset = roof(offset, fctx->stripe_size * + fctx->stripe_count); + else if (i > eof_idx) + tmp_offset = floor(offset, fctx->stripe_size * + fctx->stripe_count); + else + tmp_offset = offset; + + dest_offset = coalesced_offset(tmp_offset, + fctx->stripe_size, fctx->stripe_count); + } else { + dest_offset = offset; + } + + STACK_WIND(frame, stripe_truncate_cbk, fctx->xl_array[i], + fctx->xl_array[i]->fops->truncate, loc, dest_offset, + NULL); + } - if (!send_fop_to_all) { - STACK_WIND (frame, stripe_common_buf_cbk, trav->xlator, - trav->xlator->fops->truncate, loc, offset); - } else { - /* Initialization */ - local = CALLOC (1, sizeof (stripe_local_t)); - if (!local) { - op_errno = ENOMEM; - goto err; + return 0; +err: + STRIPE_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + + +int32_t +stripe_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, dict_t *xdata) +{ + int32_t callcnt = 0; + stripe_local_t *local = NULL; + call_frame_t *prev = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "%s returned error %s", + prev->this->name, strerror (op_errno)); + local->op_errno = op_errno; + if ((op_errno != ENOENT) || + (prev->this == FIRST_CHILD (this))) + local->failed = 1; } - local->op_ret = -1; - frame->local = local; - local->inode = loc->inode; - local->call_count = priv->child_count; - - while (trav) { - STACK_WIND (frame, stripe_stack_unwind_buf_cbk, - trav->xlator, trav->xlator->fops->truncate, - loc, offset); - trav = trav->next; + + if (op_ret == 0) { + local->op_ret = 0; + + if (FIRST_CHILD(this) == prev->this) { + local->pre_buf = *preop; + local->post_buf = *postop; + } + + local->prebuf_blocks += preop->ia_blocks; + local->postbuf_blocks += postop->ia_blocks; + + correct_file_size(preop, local->fctx, prev); + correct_file_size(postop, local->fctx, prev); + + if (local->prebuf_size < preop->ia_size) + local->prebuf_size = preop->ia_size; + if (local->postbuf_size < postop->ia_size) + local->postbuf_size = postop->ia_size; } } + UNLOCK (&frame->lock); - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL); + if (!callcnt) { + if (local->failed) + local->op_ret = -1; + + if (local->op_ret != -1) { + local->pre_buf.ia_blocks = local->prebuf_blocks; + local->pre_buf.ia_size = local->prebuf_size; + local->post_buf.ia_blocks = local->postbuf_blocks; + local->post_buf.ia_size = local->postbuf_size; + } + + STRIPE_STACK_UNWIND (setattr, frame, local->op_ret, + local->op_errno, &local->pre_buf, + &local->post_buf, NULL); + } +out: return 0; } -/** - * stripe_utimens - - */ -int32_t -stripe_utimens (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct timespec tv[2]) +int32_t +stripe_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) { - int send_fop_to_all = 0; xlator_list_t *trav = NULL; stripe_local_t *local = NULL; stripe_private_t *priv = NULL; - int32_t op_errno = 1; + stripe_fd_ctx_t *fctx = NULL; + int32_t op_errno = EINVAL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -785,46 +821,175 @@ stripe_utimens (call_frame_t *frame, xlator_t *this, loc_t *loc, goto err; } - if (S_ISDIR (loc->inode->st_mode) || S_ISREG (loc->inode->st_mode)) - send_fop_to_all = 1; + /* Initialization */ + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + local->op_ret = -1; + frame->local = local; + if (!IA_ISDIR (loc->inode->ia_type) && + !IA_ISREG (loc->inode->ia_type)) { + local->call_count = 1; + STACK_WIND (frame, stripe_setattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setattr, + loc, stbuf, valid, NULL); + return 0; + } - if (!send_fop_to_all) { - STACK_WIND (frame, stripe_common_buf_cbk, trav->xlator, - trav->xlator->fops->utimens, loc, tv); - } else { - /* Initialization */ - local = CALLOC (1, sizeof (stripe_local_t)); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - frame->local = local; - local->inode = loc->inode; - local->call_count = priv->child_count; - - while (trav) { - STACK_WIND (frame, stripe_stack_unwind_buf_cbk, - trav->xlator, trav->xlator->fops->utimens, - loc, tv); - trav = trav->next; - } + if (IA_ISREG(loc->inode->ia_type)) { + inode_ctx_get(loc->inode, this, (uint64_t *) &fctx); + if (!fctx) + goto err; + local->fctx = fctx; + } + + local->call_count = priv->child_count; + while (trav) { + STACK_WIND (frame, stripe_setattr_cbk, + trav->xlator, trav->xlator->fops->setattr, + loc, stbuf, valid, NULL); + trav = trav->next; } return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL); +err: + STRIPE_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL); return 0; } -int32_t +int32_t +stripe_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = NULL; + xlator_list_t *trav = NULL; + int32_t op_errno = EINVAL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + priv = this->private; + trav = this->children; + + /* Initialization */ + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + local->op_ret = -1; + frame->local = local; + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, stripe_setattr_cbk, trav->xlator, + trav->xlator->fops->fsetattr, fd, stbuf, valid, NULL); + trav = trav->next; + } + + return 0; +err: + STRIPE_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int32_t +stripe_stack_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) +{ + int32_t callcnt = 0; + stripe_local_t *local = NULL; + call_frame_t *prev = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "%s returned error %s", + prev->this->name, strerror (op_errno)); + local->op_errno = op_errno; + if ((op_errno != ENOENT) || + (prev->this == FIRST_CHILD (this))) + local->failed = 1; + } + + if (op_ret == 0) { + local->op_ret = 0; + + local->stbuf.ia_blocks += buf->ia_blocks; + local->preparent.ia_blocks += preoldparent->ia_blocks; + local->postparent.ia_blocks += postoldparent->ia_blocks; + local->pre_buf.ia_blocks += prenewparent->ia_blocks; + local->post_buf.ia_blocks += postnewparent->ia_blocks; + + correct_file_size(buf, local->fctx, prev); + + if (local->stbuf.ia_size < buf->ia_size) + local->stbuf.ia_size = buf->ia_size; + + if (local->preparent.ia_size < preoldparent->ia_size) + local->preparent.ia_size = preoldparent->ia_size; + + if (local->postparent.ia_size < postoldparent->ia_size) + local->postparent.ia_size = postoldparent->ia_size; + + if (local->pre_buf.ia_size < prenewparent->ia_size) + local->pre_buf.ia_size = prenewparent->ia_size; + + if (local->post_buf.ia_size < postnewparent->ia_size) + local->post_buf.ia_size = postnewparent->ia_size; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->failed) + local->op_ret = -1; + + STRIPE_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, + &local->stbuf, &local->preparent, + &local->postparent, &local->pre_buf, + &local->post_buf, NULL); + } +out: + return 0; +} + +int32_t stripe_first_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { stripe_local_t *local = NULL; xlator_list_t *trav = NULL; + if (!this || !frame || !frame->local) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + op_errno = EINVAL; + goto unwind; + } + if (op_ret == -1) { goto unwind; } @@ -832,34 +997,39 @@ stripe_first_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local = frame->local; trav = this->children; + local->stbuf = *buf; + local->preparent = *preoldparent; + local->postparent = *postoldparent; + local->pre_buf = *prenewparent; + local->post_buf = *postnewparent; + local->op_ret = 0; - local->stbuf = *buf; local->call_count--; - trav = trav->next; /* Skip first child */ + trav = trav->next; /* Skip first child */ while (trav) { - STACK_WIND (frame, stripe_stack_unwind_buf_cbk, + STACK_WIND (frame, stripe_stack_rename_cbk, trav->xlator, trav->xlator->fops->rename, - &local->loc, &local->loc2); + &local->loc, &local->loc2, NULL); trav = trav->next; } return 0; - unwind: - STACK_UNWIND (frame, op_ret, op_errno, buf); +unwind: + STRIPE_STACK_UNWIND (rename, frame, -1, op_errno, buf, preoldparent, + postoldparent, prenewparent, postnewparent, NULL); return 0; } -/** - * stripe_rename - - */ + int32_t -stripe_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, - loc_t *newloc) +stripe_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, + loc_t *newloc, dict_t *xdata) { stripe_private_t *priv = NULL; stripe_local_t *local = NULL; xlator_list_t *trav = NULL; - int32_t op_errno = 1; + stripe_fd_ctx_t *fctx = NULL; + int32_t op_errno = EINVAL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -878,101 +1048,132 @@ stripe_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, } /* Initialization */ - local = CALLOC (1, sizeof (stripe_local_t)); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } + + frame->local = local; + local->op_ret = -1; - local->inode = oldloc->inode; loc_copy (&local->loc, oldloc); loc_copy (&local->loc2, newloc); local->call_count = priv->child_count; - - frame->local = local; + + if (IA_ISREG(oldloc->inode->ia_type)) { + inode_ctx_get(oldloc->inode, this, (uint64_t *) &fctx); + if (!fctx) + goto err; + local->fctx = fctx; + } STACK_WIND (frame, stripe_first_rename_cbk, trav->xlator, - trav->xlator->fops->rename, oldloc, newloc); + trav->xlator->fops->rename, oldloc, newloc, NULL); return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL); +err: + STRIPE_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL, NULL); return 0; } - - -/** - * stripe_access - - */ int32_t -stripe_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask) +stripe_first_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - int32_t op_errno = 1; + stripe_local_t *local = NULL; + call_frame_t *prev = NULL; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (loc->inode, err); + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; - STACK_WIND (frame, stripe_common_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->access, loc, mask); + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, "%s returned %s", + prev->this->name, strerror (op_errno)); + goto out; + } + local->op_ret = 0; + local->preparent = *preparent; + local->postparent = *postparent; + local->preparent_blocks += preparent->ia_blocks; + local->postparent_blocks += postparent->ia_blocks; + STRIPE_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno, + &local->preparent, &local->postparent, xdata); return 0; - err: - STACK_UNWIND (frame, -1, op_errno); +out: + STRIPE_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } -/** - * stripe_readlink_cbk - - */ -int32_t -stripe_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, const char *path) -{ - STACK_UNWIND (frame, op_ret, op_errno, path); - return 0; -} -/** - * stripe_readlink - - */ int32_t -stripe_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size) +stripe_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - int32_t op_errno = 1; + int32_t callcnt = 0; + stripe_local_t *local = NULL; + call_frame_t *prev = NULL; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (loc->inode, err); + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; - STACK_WIND (frame, stripe_readlink_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readlink, loc, size); + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, "%s returned %s", + prev->this->name, strerror (op_errno)); + local->op_errno = op_errno; + if (op_errno != ENOENT) { + local->failed = 1; + local->op_ret = op_ret; + } + } + } + UNLOCK (&frame->lock); + + if (callcnt == 1) { + if (local->failed) { + op_errno = local->op_errno; + goto out; + } + STACK_WIND(frame, stripe_first_unlink_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->unlink, &local->loc, + local->xflag, local->xdata); + } return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL); +out: + STRIPE_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } - -/** - * stripe_unlink - - */ int32_t -stripe_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) +stripe_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, + int xflag, dict_t *xdata) { - int send_fop_to_all = 0; xlator_list_t *trav = NULL; stripe_local_t *local = NULL; stripe_private_t *priv = NULL; - int32_t op_errno = 1; + int32_t op_errno = EINVAL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -987,82 +1188,132 @@ stripe_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) op_errno = ENOTCONN; goto err; } - - if (S_ISREG (loc->inode->st_mode)) - send_fop_to_all = 1; - if (!send_fop_to_all) { - STACK_WIND (frame, stripe_common_cbk, trav->xlator, - trav->xlator->fops->unlink, loc); - } else { - /* Don't unlink a file if a node is down */ - if (priv->nodes_down) { - op_errno = ENOTCONN; - goto err; - } + /* Don't unlink a file if a node is down */ + if (priv->nodes_down) { + op_errno = ENOTCONN; + goto err; + } - /* Initialization */ - local = CALLOC (1, sizeof (stripe_local_t)); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - frame->local = local; - local->call_count = priv->child_count; - - while (trav) { - STACK_WIND (frame, stripe_stack_unwind_cbk, - trav->xlator, trav->xlator->fops->unlink, - loc); - trav = trav->next; - } + /* Initialization */ + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + local->op_ret = -1; + loc_copy (&local->loc, loc); + local->xflag = xflag; + + if (xdata) + local->xdata = dict_ref (xdata); + + frame->local = local; + local->call_count = priv->child_count; + trav = trav->next; /* Skip the first child */ + + while (trav) { + STACK_WIND (frame, stripe_unlink_cbk, + trav->xlator, trav->xlator->fops->unlink, + loc, xflag, xdata); + trav = trav->next; } return 0; - err: - STACK_UNWIND (frame, -1, op_errno); +err: + STRIPE_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); return 0; } -int32_t +int32_t stripe_first_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno,struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - xlator_list_t *trav = NULL; stripe_local_t *local = NULL; + if (!this || !frame || !frame->local) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + op_errno = EINVAL; + goto err; + } + if (op_ret == -1) { - STACK_UNWIND (frame, op_ret, op_errno); - return 0; + goto err; } - trav = this->children; local = frame->local; + local->op_ret = 0; local->call_count--; /* First child successful */ - trav = trav->next; /* Skip first child */ - while (trav) { - STACK_WIND (frame, stripe_stack_unwind_cbk, trav->xlator, - trav->xlator->fops->rmdir, &local->loc); - trav = trav->next; + local->preparent = *preparent; + local->postparent = *postparent; + local->preparent_size = preparent->ia_size; + local->postparent_size = postparent->ia_size; + local->preparent_blocks += preparent->ia_blocks; + local->postparent_blocks += postparent->ia_blocks; + + STRIPE_STACK_UNWIND (rmdir, frame, local->op_ret, local->op_errno, + &local->preparent, &local->postparent, xdata); + return 0; +err: + STRIPE_STACK_UNWIND (rmdir, frame, op_ret, op_errno, NULL, NULL, NULL); + return 0; + +} + +int32_t +stripe_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int32_t callcnt = 0; + stripe_local_t *local = NULL; + call_frame_t *prev = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, "%s returned %s", + prev->this->name, strerror (op_errno)); + if (op_errno != ENOENT) + local->failed = 1; + } } + UNLOCK (&frame->lock); + if (callcnt == 1) { + if (local->failed) + goto out; + STACK_WIND (frame, stripe_first_rmdir_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->rmdir, &local->loc, + local->flags, NULL); + } + return 0; +out: + STRIPE_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL); return 0; } -/** - * stripe_rmdir - - */ int32_t -stripe_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc) +stripe_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, dict_t *xdata) { xlator_list_t *trav = NULL; stripe_local_t *local = NULL; stripe_private_t *priv = NULL; - int32_t op_errno = 1; + int32_t op_errno = EINVAL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -1080,68 +1331,45 @@ stripe_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc) } /* Initialization */ - local = CALLOC (1, sizeof (stripe_local_t)); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } local->op_ret = -1; frame->local = local; - local->inode = loc->inode; loc_copy (&local->loc, loc); + local->flags = flags; local->call_count = priv->child_count; - - STACK_WIND (frame, stripe_first_rmdir_cbk, trav->xlator, - trav->xlator->fops->rmdir, loc); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno); - return 0; -} - - -/** - * stripe_setxattr - - */ -int32_t -stripe_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *dict, int32_t flags) -{ - stripe_private_t *priv = NULL; - int32_t op_errno = 1; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (loc->inode, err); - - priv = this->private; + trav = trav->next; /* skip the first child */ - if (priv->first_child_down) { - op_errno = ENOTCONN; - goto err; + while (trav) { + STACK_WIND (frame, stripe_rmdir_cbk, trav->xlator, + trav->xlator->fops->rmdir, loc, flags, NULL); + trav = trav->next; } - STACK_WIND (frame, stripe_common_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->setxattr, loc, dict, flags); - return 0; - err: - STACK_UNWIND (frame, -1, op_errno); +err: + STRIPE_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL); return 0; } -int32_t +int32_t stripe_mknod_ifreg_fail_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno) + int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; + if (!this || !frame || !frame->local) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + local = frame->local; LOCK (&frame->lock); @@ -1151,11 +1379,11 @@ stripe_mknod_ifreg_fail_unlink_cbk (call_frame_t *frame, void *cookie, UNLOCK (&frame->lock); if (!callcnt) { - loc_wipe (&local->loc); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->inode, &local->stbuf); + STRIPE_STACK_UNWIND (mknod, frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, + &local->preparent, &local->postparent, NULL); } - +out: return 0; } @@ -1165,22 +1393,31 @@ stripe_mknod_ifreg_fail_unlink_cbk (call_frame_t *frame, void *cookie, int32_t stripe_mknod_ifreg_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno) + int32_t op_errno, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; stripe_private_t *priv = NULL; xlator_list_t *trav = NULL; + call_frame_t *prev = NULL; - LOCK (&frame->lock); + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + priv = this->private; + local = frame->local; + + LOCK (&frame->lock); { callcnt = --local->call_count; - + if (op_ret == -1) { gf_log (this->name, GF_LOG_DEBUG, "%s returned error %s", - ((call_frame_t *)cookie)->this->name, - strerror (op_errno)); + prev->this->name, strerror (op_errno)); local->op_ret = -1; local->op_errno = op_errno; } @@ -1195,141 +1432,239 @@ stripe_mknod_ifreg_setxattr_cbk (call_frame_t *frame, void *cookie, stripe_mknod_ifreg_fail_unlink_cbk, trav->xlator, trav->xlator->fops->unlink, - &local->loc); + &local->loc, 0, NULL); trav = trav->next; } return 0; } - loc_wipe (&local->loc); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->inode, &local->stbuf); + STRIPE_STACK_UNWIND (mknod, frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, + &local->preparent, &local->postparent, NULL); } +out: return 0; } -/** - */ int32_t stripe_mknod_ifreg_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, - struct stat *buf) + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - int ret = 0; int32_t callcnt = 0; stripe_local_t *local = NULL; - xlator_list_t *trav = NULL; stripe_private_t *priv = NULL; + call_frame_t *prev = NULL; + xlator_list_t *trav = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + prev = cookie; + priv = this->private; local = frame->local; LOCK (&frame->lock); { callcnt = --local->call_count; - + if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_DEBUG, "%s returned error %s", - ((call_frame_t *)cookie)->this->name, - strerror (op_errno)); - local->failed = 1; + prev->this->name, strerror (op_errno)); + if ((op_errno != ENOENT) || + (prev->this == FIRST_CHILD (this))) + local->failed = 1; local->op_errno = op_errno; } - if (op_ret >= 0) { local->op_ret = op_ret; - /* Get the mapping in inode private */ - /* Get the stat buf right */ - if (local->stbuf.st_blksize == 0) { - local->stbuf = *buf; - /* Because st_blocks gets added again */ - local->stbuf.st_blocks = 0; - } - /* Always, pass the inode number of first child - to the above layer */ - if (FIRST_CHILD(this) == - ((call_frame_t *)cookie)->this) - local->stbuf.st_ino = buf->st_ino; - - local->stbuf.st_blocks += buf->st_blocks; - if (local->stbuf.st_size < buf->st_size) - local->stbuf.st_size = buf->st_size; - if (local->stbuf.st_blksize != buf->st_blksize) { - /* TODO: add to blocks in terms of - original block size */ - } + /* Can be used as a mechanism to understand if mknod + was successful in at least one place */ + if (uuid_is_null (local->ia_gfid)) + uuid_copy (local->ia_gfid, buf->ia_gfid); + + if (stripe_ctx_handle(this, prev, local, xdata)) + gf_log(this->name, GF_LOG_ERROR, + "Error getting fctx info from dict"); + + local->stbuf_blocks += buf->ia_blocks; + local->preparent_blocks += preparent->ia_blocks; + local->postparent_blocks += postparent->ia_blocks; + + correct_file_size(buf, local->fctx, prev); + + if (local->stbuf_size < buf->ia_size) + local->stbuf_size = buf->ia_size; + if (local->preparent_size < preparent->ia_size) + local->preparent_size = preparent->ia_size; + if (local->postparent_size < postparent->ia_size) + local->postparent_size = postparent->ia_size; } } UNLOCK (&frame->lock); if (!callcnt) { - if (local->failed) + if (local->failed) local->op_ret = -1; - if ((local->op_ret != -1) && priv->xattr_supported) { - /* Send a setxattr request to nodes where the - files are created */ - int32_t index = 0; - char size_key[256] = {0,}; - char index_key[256] = {0,}; - char count_key[256] = {0,}; - dict_t *dict = NULL; - - trav = this->children; - sprintf (size_key, - "trusted.%s.stripe-size", this->name); - sprintf (count_key, - "trusted.%s.stripe-count", this->name); - sprintf (index_key, - "trusted.%s.stripe-index", this->name); - + if ((local->op_ret == -1) && !uuid_is_null (local->ia_gfid)) { + /* ia_gfid set means, at least on one node 'mknod' + is successful */ local->call_count = priv->child_count; - + trav = this->children; while (trav) { - dict = get_new_dict (); - dict_ref (dict); - /* TODO: check return value */ - ret = dict_set_int64 (dict, size_key, - local->stripe_size); - ret = dict_set_int32 (dict, count_key, - priv->child_count); - ret = dict_set_int32 (dict, index_key, index); - STACK_WIND (frame, - stripe_mknod_ifreg_setxattr_cbk, + stripe_mknod_ifreg_fail_unlink_cbk, trav->xlator, - trav->xlator->fops->setxattr, - &local->loc, dict, 0); - - dict_unref (dict); - index++; + trav->xlator->fops->unlink, + &local->loc, 0, NULL); trav = trav->next; } + return 0; + } + + + if (local->op_ret != -1) { + local->preparent.ia_blocks = local->preparent_blocks; + local->preparent.ia_size = local->preparent_size; + local->postparent.ia_blocks = local->postparent_blocks; + local->postparent.ia_size = local->postparent_size; + local->stbuf.ia_size = local->stbuf_size; + local->stbuf.ia_blocks = local->stbuf_blocks; + inode_ctx_put (local->inode, this, + (uint64_t)(long) local->fctx); + + } + STRIPE_STACK_UNWIND (mknod, frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, + &local->preparent, &local->postparent, NULL); + } +out: + return 0; +} + + +int32_t +stripe_mknod_first_ifreg_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = NULL; + call_frame_t *prev = NULL; + xlator_list_t *trav = NULL; + int i = 1; + dict_t *dict = NULL; + int ret = 0; + int need_unref = 0; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + priv = this->private; + local = frame->local; + trav = this->children; + + local->call_count--; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, "%s returned error %s", + prev->this->name, strerror (op_errno)); + local->failed = 1; + local->op_errno = op_errno; + goto out; + } + + local->op_ret = op_ret; + + local->stbuf = *buf; + local->preparent = *preparent; + local->postparent = *postparent; + + if (uuid_is_null (local->ia_gfid)) + uuid_copy (local->ia_gfid, buf->ia_gfid); + local->preparent.ia_blocks = local->preparent_blocks; + local->preparent.ia_size = local->preparent_size; + local->postparent.ia_blocks = local->postparent_blocks; + local->postparent.ia_size = local->postparent_size; + local->stbuf.ia_size = local->stbuf_size; + local->stbuf.ia_blocks = local->stbuf_blocks; + + trav = trav->next; + while (trav) { + if (priv->xattr_supported) { + dict = dict_new (); + if (!dict) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate dict %s", local->loc.path); + } + need_unref = 1; + + dict_copy (local->xattr, dict); + + ret = stripe_xattr_request_build (this, dict, + local->stripe_size, + priv->child_count, i, + priv->coalesce); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Failed to build xattr request"); + } else { - /* Create itself has failed.. so return - without setxattring */ - loc_wipe (&local->loc); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->inode, &local->stbuf); + dict = local->xattr; } + + STACK_WIND (frame, stripe_mknod_ifreg_cbk, + trav->xlator, trav->xlator->fops->mknod, + &local->loc, local->mode, local->rdev, 0, dict); + trav = trav->next; + i++; + + if (dict && need_unref) + dict_unref (dict); } - + return 0; + +out: + + STRIPE_STACK_UNWIND (mknod, frame, op_ret, op_errno, NULL, NULL, NULL, NULL, NULL); + return 0; } -/** - * stripe_mknod - - */ int32_t +stripe_single_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + STRIPE_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); + return 0; +} + + +int stripe_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t rdev) + dev_t rdev, mode_t umask, dict_t *xdata) { - stripe_private_t *priv = NULL; - stripe_local_t *local = NULL; - xlator_list_t *trav = NULL; - int32_t op_errno = 1; + stripe_private_t *priv = NULL; + stripe_local_t *local = NULL; + int32_t op_errno = EINVAL; + int32_t i = 0; + dict_t *dict = NULL; + int ret = 0; + int need_unref = 0; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -1338,71 +1673,216 @@ stripe_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, VALIDATE_OR_GOTO (loc->inode, err); priv = this->private; - trav = this->children; - + if (priv->first_child_down) { op_errno = ENOTCONN; goto err; } if (S_ISREG(mode)) { - /* NOTE: on older kernels (older than 2.6.9), - creat() fops is sent as mknod() + open(). Hence handling + /* NOTE: on older kernels (older than 2.6.9), + creat() fops is sent as mknod() + open(). Hence handling S_IFREG files is necessary */ if (priv->nodes_down) { - gf_log (this->name, GF_LOG_WARNING, + gf_log (this->name, GF_LOG_WARNING, "Some node down, returning EIO"); op_errno = EIO; goto err; } - + /* Initialization */ - local = CALLOC (1, sizeof (stripe_local_t)); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } local->op_ret = -1; local->op_errno = ENOTCONN; - local->stripe_size = stripe_get_matching_bs (loc->path, - priv->pattern, - priv->block_size); + local->stripe_size = stripe_get_matching_bs (loc->path, priv); frame->local = local; - local->inode = loc->inode; + local->inode = inode_ref (loc->inode); loc_copy (&local->loc, loc); + local->xattr = dict_copy_with_ref (xdata, NULL); + local->mode = mode; + local->umask = umask; + local->rdev = rdev; /* Everytime in stripe lookup, all child nodes should be looked up */ local->call_count = priv->child_count; - - while (trav) { - STACK_WIND (frame, stripe_mknod_ifreg_cbk, - trav->xlator, trav->xlator->fops->mknod, - loc, mode, rdev); - trav = trav->next; + + if (priv->xattr_supported) { + dict = dict_new (); + if (!dict) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate dict %s", loc->path); + } + need_unref = 1; + + dict_copy (xdata, dict); + + ret = stripe_xattr_request_build (this, dict, + local->stripe_size, + priv->child_count, + i, priv->coalesce); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "failed to build xattr request"); + } else { + dict = xdata; } - /* This case is handled, no need to continue further. */ - return 0; - } + STACK_WIND (frame, stripe_mknod_first_ifreg_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->mknod, + loc, mode, rdev, umask, dict); + if (dict && need_unref) + dict_unref (dict); + return 0; + } - STACK_WIND (frame, stripe_common_inode_cbk, + STACK_WIND (frame, stripe_single_mknod_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, - loc, mode, rdev); + loc, mode, rdev, umask, xdata); return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); +err: + STRIPE_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + return 0; +} + + +int32_t +stripe_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int32_t callcnt = 0; + stripe_local_t *local = NULL; + call_frame_t *prev = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "%s returned error %s", + prev->this->name, strerror (op_errno)); + local->op_errno = op_errno; + if ((op_errno != ENOENT) || + (prev->this == FIRST_CHILD (this))) + local->failed = 1; + } + + if (op_ret >= 0) { + local->op_ret = 0; + + local->stbuf_blocks += buf->ia_blocks; + local->preparent_blocks += preparent->ia_blocks; + local->postparent_blocks += postparent->ia_blocks; + + if (local->stbuf_size < buf->ia_size) + local->stbuf_size = buf->ia_size; + if (local->preparent_size < preparent->ia_size) + local->preparent_size = preparent->ia_size; + if (local->postparent_size < postparent->ia_size) + local->postparent_size = postparent->ia_size; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->failed != -1) { + local->preparent.ia_blocks = local->preparent_blocks; + local->preparent.ia_size = local->preparent_size; + local->postparent.ia_blocks = local->postparent_blocks; + local->postparent.ia_size = local->postparent_size; + local->stbuf.ia_size = local->stbuf_size; + local->stbuf.ia_blocks = local->stbuf_blocks; + } + STRIPE_STACK_UNWIND (mkdir, frame, local->op_ret, + local->op_errno, local->inode, + &local->stbuf, &local->preparent, + &local->postparent, NULL); + } +out: return 0; } -/** - * stripe_mkdir - - */ int32_t -stripe_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode) +stripe_first_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + stripe_local_t *local = NULL; + call_frame_t *prev = NULL; + xlator_list_t *trav = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; + trav = this->children; + + local->call_count--; /* first child is successful */ + trav = trav->next; /* skip first child */ + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, "%s returned error %s", + prev->this->name, strerror (op_errno)); + local->op_errno = op_errno; + goto out; + } + + local->op_ret = 0; + + local->inode = inode_ref (inode); + local->stbuf = *buf; + local->postparent = *postparent; + local->preparent = *preparent; + + local->stbuf_blocks += buf->ia_blocks; + local->preparent_blocks += preparent->ia_blocks; + local->postparent_blocks += postparent->ia_blocks; + + local->stbuf_size = buf->ia_size; + local->preparent_size = preparent->ia_size; + local->postparent_size = postparent->ia_size; + + while (trav) { + STACK_WIND (frame, stripe_mkdir_cbk, trav->xlator, + trav->xlator->fops->mkdir, &local->loc, local->mode, + local->umask, local->xdata); + trav = trav->next; + } + return 0; +out: + STRIPE_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL); + + return 0; + +} + + +int +stripe_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) { stripe_private_t *priv = NULL; stripe_local_t *local = NULL; @@ -1417,72 +1897,130 @@ stripe_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode) priv = this->private; trav = this->children; - + if (priv->first_child_down) { op_errno = ENOTCONN; goto err; } /* Initialization */ - local = CALLOC (1, sizeof (stripe_local_t)); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } local->op_ret = -1; local->call_count = priv->child_count; + if (xdata) + local->xdata = dict_ref (xdata); + local->mode = mode; + local->umask = umask; + loc_copy (&local->loc, loc); frame->local = local; /* Everytime in stripe lookup, all child nodes should be looked up */ - while (trav) { - STACK_WIND (frame, stripe_stack_unwind_inode_cbk, - trav->xlator, trav->xlator->fops->mkdir, - loc, mode); - trav = trav->next; - } + STACK_WIND (frame, stripe_first_mkdir_cbk, trav->xlator, + trav->xlator->fops->mkdir, loc, mode, umask, xdata); return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); +err: + STRIPE_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); return 0; } -/** - * stripe_symlink - - */ int32_t -stripe_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, - loc_t *loc) +stripe_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - int32_t op_errno = 1; - stripe_private_t *priv = NULL; - - priv = this->private; + int32_t callcnt = 0; + stripe_local_t *local = NULL; + call_frame_t *prev = NULL; + stripe_fd_ctx_t *fctx = NULL; - if (priv->first_child_down) { - op_errno = ENOTCONN; - goto err; + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; } - /* send symlink to only first node */ - STACK_WIND (frame, stripe_common_inode_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->symlink, linkpath, loc); + prev = cookie; + local = frame->local; - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "%s returned error %s", + prev->this->name, strerror (op_errno)); + local->op_errno = op_errno; + if ((op_errno != ENOENT) || + (prev->this == FIRST_CHILD (this))) + local->failed = 1; + } + if (op_ret >= 0) { + local->op_ret = 0; + + if (IA_ISREG(inode->ia_type)) { + inode_ctx_get(inode, this, (uint64_t *) &fctx); + if (!fctx) { + gf_log(this->name, GF_LOG_ERROR, + "failed to get stripe context"); + op_ret = -1; + op_errno = EINVAL; + } + } + + if (FIRST_CHILD(this) == prev->this) { + local->inode = inode_ref (inode); + local->stbuf = *buf; + local->postparent = *postparent; + local->preparent = *preparent; + } + local->stbuf_blocks += buf->ia_blocks; + local->preparent_blocks += preparent->ia_blocks; + local->postparent_blocks += postparent->ia_blocks; + + correct_file_size(buf, fctx, prev); + + if (local->stbuf_size < buf->ia_size) + local->stbuf_size = buf->ia_size; + if (local->preparent_size < preparent->ia_size) + local->preparent_size = preparent->ia_size; + if (local->postparent_size < postparent->ia_size) + local->postparent_size = postparent->ia_size; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->failed) + local->op_ret = -1; + + if (local->op_ret != -1) { + local->preparent.ia_blocks = local->preparent_blocks; + local->preparent.ia_size = local->preparent_size; + local->postparent.ia_blocks = local->postparent_blocks; + local->postparent.ia_size = local->postparent_size; + local->stbuf.ia_size = local->stbuf_size; + local->stbuf.ia_blocks = local->stbuf_blocks; + } + STRIPE_STACK_UNWIND (link, frame, local->op_ret, + local->op_errno, local->inode, + &local->stbuf, &local->preparent, + &local->postparent, NULL); + } +out: return 0; } -/** - * stripe_link - - */ int32_t -stripe_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc) +stripe_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, dict_t *xdata) { - int send_fop_to_all = 0; xlator_list_t *trav = NULL; stripe_local_t *local = NULL; stripe_private_t *priv = NULL; @@ -1503,49 +2041,45 @@ stripe_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc) goto err; } - if (S_ISREG (oldloc->inode->st_mode)) - send_fop_to_all = 1; + /* Initialization */ + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + local->op_ret = -1; + frame->local = local; + local->call_count = priv->child_count; - if (!send_fop_to_all) { - STACK_WIND (frame, stripe_common_inode_cbk, + /* Everytime in stripe lookup, all child + nodes should be looked up */ + while (trav) { + STACK_WIND (frame, stripe_link_cbk, trav->xlator, trav->xlator->fops->link, - oldloc, newloc); - } else { - /* Initialization */ - local = CALLOC (1, sizeof (stripe_local_t)); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - frame->local = local; - local->call_count = priv->child_count; - - /* Everytime in stripe lookup, all child - nodes should be looked up */ - while (trav) { - STACK_WIND (frame, stripe_stack_unwind_inode_cbk, - trav->xlator, trav->xlator->fops->link, - oldloc, newloc); - trav = trav->next; - } + oldloc, newloc, NULL); + trav = trav->next; } return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); +err: + STRIPE_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); return 0; } -int32_t +int32_t stripe_create_fail_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno) + int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { int32_t callcnt = 0; - fd_t *lfd = NULL; stripe_local_t *local = NULL; + if (!this || !frame || !frame->local) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + local = frame->local; LOCK (&frame->lock); @@ -1555,47 +2089,78 @@ stripe_create_fail_unlink_cbk (call_frame_t *frame, void *cookie, UNLOCK (&frame->lock); if (!callcnt) { - lfd = local->fd; - loc_wipe (&local->loc); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->fd, local->inode, &local->stbuf); - fd_unref (lfd); + STRIPE_STACK_UNWIND (create, frame, local->op_ret, local->op_errno, + local->fd, local->inode, &local->stbuf, + &local->preparent, &local->postparent, NULL); } +out: return 0; } -/** - * stripe_create_setxattr_cbk - - */ int32_t -stripe_create_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +stripe_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, + inode_t *inode, struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - fd_t *lfd = NULL; + int32_t callcnt = 0; stripe_local_t *local = NULL; stripe_private_t *priv = NULL; + call_frame_t *prev = NULL; xlator_list_t *trav = NULL; - int32_t callcnt = 0; + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + priv = this->private; local = frame->local; LOCK (&frame->lock); { callcnt = --local->call_count; - + if (op_ret == -1) { gf_log (this->name, GF_LOG_DEBUG, "%s returned error %s", - ((call_frame_t *)cookie)->this->name, - strerror (op_errno)); - local->op_ret = -1; + prev->this->name, strerror (op_errno)); + local->failed = 1; local->op_errno = op_errno; } + + if (op_ret >= 0) { + if (IA_ISREG(buf->ia_type)) { + if (stripe_ctx_handle(this, prev, local, xdata)) + gf_log(this->name, GF_LOG_ERROR, + "Error getting fctx info from " + "dict"); + } + + local->op_ret = op_ret; + + local->stbuf_blocks += buf->ia_blocks; + local->preparent_blocks += preparent->ia_blocks; + local->postparent_blocks += postparent->ia_blocks; + + correct_file_size(buf, local->fctx, prev); + + if (local->stbuf_size < buf->ia_size) + local->stbuf_size = buf->ia_size; + if (local->preparent_size < preparent->ia_size) + local->preparent_size = preparent->ia_size; + if (local->postparent_size < postparent->ia_size) + local->postparent_size = postparent->ia_size; + } } UNLOCK (&frame->lock); if (!callcnt) { + if (local->failed) + local->op_ret = -1; + if (local->op_ret == -1) { local->call_count = priv->child_count; trav = this->children; @@ -1604,165 +2169,180 @@ stripe_create_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, stripe_create_fail_unlink_cbk, trav->xlator, trav->xlator->fops->unlink, - &local->loc); + &local->loc, 0, NULL); trav = trav->next; } - + return 0; } - lfd = local->fd; - loc_wipe (&local->loc); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->fd, local->inode, &local->stbuf); - fd_unref (lfd); + if (local->op_ret >= 0) { + local->preparent.ia_blocks = local->preparent_blocks; + local->preparent.ia_size = local->preparent_size; + local->postparent.ia_blocks = local->postparent_blocks; + local->postparent.ia_size = local->postparent_size; + local->stbuf.ia_size = local->stbuf_size; + local->stbuf.ia_blocks = local->stbuf_blocks; + + stripe_copy_xl_array(local->fctx->xl_array, + priv->xl_array, + local->fctx->stripe_count); + inode_ctx_put(local->inode, this, + (uint64_t) local->fctx); + } + + /* Create itself has failed.. so return + without setxattring */ + STRIPE_STACK_UNWIND (create, frame, local->op_ret, + local->op_errno, local->fd, + local->inode, &local->stbuf, + &local->preparent, &local->postparent, NULL); } +out: return 0; } -/** - * stripe_create_cbk - - */ + + int32_t -stripe_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +stripe_first_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, fd_t *fd, - inode_t *inode, struct stat *buf) + inode_t *inode, struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - int32_t callcnt = 0; stripe_local_t *local = NULL; stripe_private_t *priv = NULL; - fd_t *lfd = NULL; - stripe_fd_ctx_t *fctx = NULL; + call_frame_t *prev = NULL; + xlator_list_t *trav = NULL; + int i = 1; + dict_t *dict = NULL; + loc_t *loc = NULL; + int32_t need_unref = 0; + int32_t ret = -1; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + prev = cookie; priv = this->private; local = frame->local; + trav = this->children; + loc = &local->loc; - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - ((call_frame_t *)cookie)->this->name, - strerror (op_errno)); - local->failed = 1; - local->op_errno = op_errno; - } - - if (op_ret >= 0) { - local->op_ret = op_ret; - /* Get the mapping in inode private */ - /* Get the stat buf right */ - if (local->stbuf.st_blksize == 0) { - local->stbuf = *buf; - /* Because st_blocks gets added again */ - local->stbuf.st_blocks = 0; - } - - /* Always, pass the inode number of first - child to the above layer */ - if (FIRST_CHILD(this) == - ((call_frame_t *)cookie)->this) - local->stbuf.st_ino = buf->st_ino; - - local->stbuf.st_blocks += buf->st_blocks; - if (local->stbuf.st_size < buf->st_size) - local->stbuf.st_size = buf->st_size; - if (local->stbuf.st_blksize != buf->st_blksize) { - /* TODO: add to blocks in terms of - original block size */ - } - } + --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, "%s returned error %s", + prev->this->name, strerror (op_errno)); + local->failed = 1; + local->op_errno = op_errno; } - UNLOCK (&frame->lock); - if (!callcnt) { - if (local->failed) - local->op_ret = -1; + local->op_ret = 0; + /* Get the mapping in inode private */ + /* Get the stat buf right */ + local->stbuf = *buf; + local->preparent = *preparent; + local->postparent = *postparent; + + local->stbuf_blocks += buf->ia_blocks; + local->preparent_blocks += preparent->ia_blocks; + local->postparent_blocks += postparent->ia_blocks; + + if (local->stbuf_size < buf->ia_size) + local->stbuf_size = buf->ia_size; + if (local->preparent_size < preparent->ia_size) + local->preparent_size = preparent->ia_size; + if (local->postparent_size < postparent->ia_size) + local->postparent_size = postparent->ia_size; + + if (local->failed) + local->op_ret = -1; - /* */ - if (local->op_ret >= 0) { - fctx = CALLOC (1, sizeof (stripe_fd_ctx_t)); - if (fctx) { - fctx->stripe_size = local->stripe_size; - fctx->stripe_count = priv->child_count; - fctx->static_array = 1; - fctx->xl_array = priv->xl_array; - fd_ctx_set (local->fd, this, - (uint64_t)(long)fctx); - } - } + if (local->op_ret == -1) { + local->call_count = 1; + STACK_WIND (frame, stripe_create_fail_unlink_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->unlink, + &local->loc, 0, NULL); + return 0; + } - if ((local->op_ret != -1) && - local->stripe_size && priv->xattr_supported) { - /* Send a setxattr request to nodes where - the files are created */ - int ret = 0; - int32_t i = 0; - char size_key[256] = {0,}; - char index_key[256] = {0,}; - char count_key[256] = {0,}; - dict_t *dict = NULL; - - sprintf (size_key, - "trusted.%s.stripe-size", this->name); - sprintf (count_key, - "trusted.%s.stripe-count", this->name); - sprintf (index_key, - "trusted.%s.stripe-index", this->name); + if (local->op_ret >= 0) { + local->preparent.ia_blocks = local->preparent_blocks; + local->preparent.ia_size = local->preparent_size; + local->postparent.ia_blocks = local->postparent_blocks; + local->postparent.ia_size = local->postparent_size; + local->stbuf.ia_size = local->stbuf_size; + local->stbuf.ia_blocks = local->stbuf_blocks; + } - local->call_count = priv->child_count; - - for (i = 0; i < priv->child_count; i++) { - dict = get_new_dict (); - dict_ref (dict); - - /* TODO: check return values */ - ret = dict_set_int64 (dict, size_key, - local->stripe_size); - ret = dict_set_int32 (dict, count_key, - priv->child_count); - ret = dict_set_int32 (dict, index_key, i); - - STACK_WIND (frame, stripe_create_setxattr_cbk, - priv->xl_array[i], - priv->xl_array[i]->fops->setxattr, - &local->loc, dict, 0); - - dict_unref (dict); + /* Send a setxattr request to nodes where the + files are created */ + trav = trav->next; + while (trav) { + if (priv->xattr_supported) { + dict = dict_new (); + if (!dict) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate dict %s", loc->path); } + need_unref = 1; + + dict_copy (local->xattr, dict); + + ret = stripe_xattr_request_build (this, dict, + local->stripe_size, + priv->child_count, + i, priv->coalesce); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "failed to build xattr request"); } else { - /* Create itself has failed.. so return - without setxattring */ - lfd = local->fd; - loc_wipe (&local->loc); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->fd, local->inode, &local->stbuf); - - fd_unref (lfd); + dict = local->xattr; } + + STACK_WIND (frame, stripe_create_cbk, trav->xlator, + trav->xlator->fops->create, &local->loc, + local->flags, local->mode, local->umask, local->fd, + dict); + trav = trav->next; + if (need_unref && dict) + dict_unref (dict); + i++; } - + +out: return 0; } + /** - * stripe_create - If a block-size is specified for the 'name', create the + * stripe_create - If a block-size is specified for the 'name', create the * file in all the child nodes. If not, create it in only first child. * * @name- complete path of the file to be created. */ int32_t stripe_create (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t flags, mode_t mode, fd_t *fd) + int32_t flags, mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { stripe_private_t *priv = NULL; stripe_local_t *local = NULL; - xlator_list_t *trav = NULL; - int32_t op_errno = 1; + int32_t op_errno = EINVAL; + int ret = 0; + int need_unref = 0; + int i = 0; + dict_t *dict = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->path, err); + VALIDATE_OR_GOTO (loc->inode, err); priv = this->private; @@ -1770,121 +2350,86 @@ stripe_create (call_frame_t *frame, xlator_t *this, loc_t *loc, flags &= ~O_APPEND; if (priv->first_child_down || priv->nodes_down) { - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_DEBUG, "First node down, returning EIO"); op_errno = EIO; goto err; } /* Initialization */ - local = CALLOC (1, sizeof (stripe_local_t)); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } local->op_ret = -1; local->op_errno = ENOTCONN; - local->stripe_size = stripe_get_matching_bs (loc->path, - priv->pattern, - priv->block_size); + local->stripe_size = stripe_get_matching_bs (loc->path, priv); frame->local = local; - local->inode = loc->inode; + local->inode = inode_ref (loc->inode); loc_copy (&local->loc, loc); local->fd = fd_ref (fd); + local->flags = flags; + local->mode = mode; + local->umask = umask; + if (xdata) + local->xattr = dict_ref (xdata); local->call_count = priv->child_count; - - trav = this->children; - while (trav) { - STACK_WIND (frame, stripe_create_cbk, trav->xlator, - trav->xlator->fops->create, loc, flags, mode, fd); - trav = trav->next; - } - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL); - return 0; -} - -/** - * stripe_open_cbk - - */ -int32_t -stripe_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) -{ - int32_t callcnt = 0; - stripe_local_t *local = NULL; - fd_t *lfd = NULL; + /* Send a setxattr request to nodes where the + files are created */ - local = frame->local; + if (priv->xattr_supported) { + dict = dict_new (); + if (!dict) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate dict %s", loc->path); + } + need_unref = 1; - LOCK (&frame->lock); - { - callcnt = --local->call_count; + dict_copy (xdata, dict); - if (op_ret == -1) { - local->failed = 1; - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - ((call_frame_t *)cookie)->this->name, - strerror (op_errno)); - local->op_ret = -1; - local->op_errno = op_errno; - } - - if (op_ret >= 0) - local->op_ret = op_ret; + ret = stripe_xattr_request_build (this, dict, + local->stripe_size, + priv->child_count, + i, priv->coalesce); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "failed to build xattr request"); + } else { + dict = xdata; } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->failed) - local->op_ret = -1; - if (local->op_ret == -1) { - if (local->fctx) { - if (!local->fctx->static_array) - FREE (local->fctx->xl_array); - FREE (local->fctx); - } - } else { - fd_ctx_set (local->fd, this, - (uint64_t)(long)local->fctx); - } - lfd = local->fd; - loc_wipe (&local->loc); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->fd); - fd_unref (lfd); + STACK_WIND (frame, stripe_first_create_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->create, loc, flags, mode, + umask, fd, dict); + + if (need_unref && dict) + dict_unref (dict); - } return 0; +err: + STRIPE_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL, xdata); + return 0; } - -/** - * stripe_getxattr_cbk - - */ int32_t -stripe_open_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict) +stripe_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { - int32_t index = 0; - int32_t callcnt = 0; - char key[256] = {0,}; - stripe_local_t *local = NULL; - xlator_list_t *trav = NULL; - stripe_private_t *priv = NULL; - data_t *data = NULL; - call_frame_t *prev = NULL; - fd_t *lfd = NULL; + int32_t callcnt = 0; + stripe_local_t *local = NULL; + call_frame_t *prev = NULL; - prev = (call_frame_t *)cookie; - priv = this->private; + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; local = frame->local; LOCK (&frame->lock); @@ -1892,145 +2437,39 @@ stripe_open_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, callcnt = --local->call_count; if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, + + gf_log (this->name, GF_LOG_DEBUG, "%s returned error %s", - ((call_frame_t *)cookie)->this->name, - strerror (op_errno)); - local->op_ret = -1; - if (local->op_errno != EIO) - local->op_errno = op_errno; - if (op_errno == ENOTCONN) + prev->this->name, strerror (op_errno)); + if ((op_errno != ENOENT) || + (prev->this == FIRST_CHILD (this))) local->failed = 1; - goto unlock; - } - - if (!local->fctx) { - local->fctx = CALLOC (1, sizeof (stripe_fd_ctx_t)); - if (!local->fctx) { - local->op_errno = ENOMEM; - local->op_ret = -1; - goto unlock; - } - - local->fctx->static_array = 0; - } - /* Stripe block size */ - sprintf (key, "trusted.%s.stripe-size", this->name); - data = dict_get (dict, key); - if (!data) { - local->xattr_self_heal_needed = 1; - } else { - if (!local->fctx->stripe_size) { - local->fctx->stripe_size = - data_to_int64 (data); - } - - if (local->fctx->stripe_size != data_to_int64 (data)) { - gf_log (this->name, GF_LOG_DEBUG, - "stripe-size mismatch in blocks"); - local->xattr_self_heal_needed = 1; - } - } - /* Stripe count */ - sprintf (key, "trusted.%s.stripe-count", this->name); - data = dict_get (dict, key); - if (!data) { - local->xattr_self_heal_needed = 1; - goto unlock; - } - if (!local->fctx->xl_array) { - local->fctx->stripe_count = data_to_int32 (data); - if (!local->fctx->stripe_count) { - gf_log (this->name, GF_LOG_ERROR, - "error with stripe-count xattr"); - local->op_ret = -1; - local->op_errno = EIO; - goto unlock; - } - local->fctx->xl_array = - CALLOC (local->fctx->stripe_count, - sizeof (xlator_t *)); - } - if (local->fctx->stripe_count != data_to_int32 (data)) { - gf_log (this->name, GF_LOG_ERROR, - "error with stripe-count xattr"); - local->op_ret = -1; - local->op_errno = EIO; - goto unlock; + local->op_errno = op_errno; } - /* index */ - sprintf (key, "trusted.%s.stripe-index", this->name); - data = dict_get (dict, key); - if (!data) { - local->xattr_self_heal_needed = 1; - goto unlock; - } - index = data_to_int32 (data); - if (index > priv->child_count) { - gf_log (this->name, GF_LOG_ERROR, - "error with stripe-index xattr"); - local->op_ret = -1; - local->op_errno = EIO; - goto unlock; - } - if (local->fctx->xl_array) - local->fctx->xl_array[index] = prev->this; - local->entry_count++; - local->op_ret = 0; + if (op_ret >= 0) + local->op_ret = op_ret; } - unlock: UNLOCK (&frame->lock); - - if (!callcnt) { - /* TODO: if self-heal flag is set, do it */ - if (local->xattr_self_heal_needed) { - gf_log (this->name, GF_LOG_DEBUG, - "%s: stripe info need to be healed", - local->loc.path); - } - - if (local->op_ret) - goto err; - if (local->entry_count != local->fctx->stripe_count) { - local->op_ret = -1; - local->op_errno = EIO; - goto err; - } - if (!local->fctx->stripe_size) { + if (!callcnt) { + if (local->failed) local->op_ret = -1; - local->op_errno = EIO; - goto err; - } - local->call_count = local->fctx->stripe_count; - - trav = this->children; - while (trav) { - STACK_WIND (frame, stripe_open_cbk, trav->xlator, - trav->xlator->fops->open, &local->loc, - local->flags, local->fd); - trav = trav->next; - } + STRIPE_STACK_UNWIND (open, frame, local->op_ret, + local->op_errno, local->fd, xdata); } - - return 0; - err: - lfd = local->fd; - loc_wipe (&local->loc); - STACK_UNWIND (frame, local->op_ret, local->op_errno, local->fd); - fd_unref (lfd); - +out: return 0; } + /** - * stripe_open - + * stripe_open - */ int32_t stripe_open (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t flags, fd_t *fd) + int32_t flags, fd_t *fd, dict_t *xdata) { stripe_local_t *local = NULL; stripe_private_t *priv = NULL; @@ -2052,7 +2491,7 @@ stripe_open (call_frame_t *frame, xlator_t *this, loc_t *loc, } /* Initialization */ - local = CALLOC (1, sizeof (stripe_local_t)); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -2063,97 +2502,76 @@ stripe_open (call_frame_t *frame, xlator_t *this, loc_t *loc, local->fd = fd_ref (fd); frame->local = local; - local->inode = loc->inode; loc_copy (&local->loc, loc); /* Striped files */ local->flags = flags; local->call_count = priv->child_count; - local->stripe_size = stripe_get_matching_bs (loc->path, - priv->pattern, - priv->block_size); - - if (priv->xattr_supported) { - while (trav) { - STACK_WIND (frame, stripe_open_getxattr_cbk, - trav->xlator, trav->xlator->fops->getxattr, - loc, NULL); - trav = trav->next; - } - } else { - local->fctx = CALLOC (1, sizeof (stripe_fd_ctx_t)); - if (!local->fctx) { - op_errno = ENOMEM; - goto err; - } - - local->fctx->static_array = 1; - local->fctx->stripe_size = local->stripe_size; - local->fctx->stripe_count = priv->child_count; - local->fctx->xl_array = priv->xl_array; - - while (trav) { - STACK_WIND (frame, stripe_open_cbk, trav->xlator, - trav->xlator->fops->open, - &local->loc, local->flags, local->fd); - trav = trav->next; - } - } + local->stripe_size = stripe_get_matching_bs (loc->path, priv); + while (trav) { + STACK_WIND (frame, stripe_open_cbk, trav->xlator, + trav->xlator->fops->open, + &local->loc, local->flags, local->fd, + xdata); + trav = trav->next; + } return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL); +err: + STRIPE_STACK_UNWIND (open, frame, -1, op_errno, NULL, NULL); return 0; } -/** - * stripe_opendir_cbk - - */ + int32_t stripe_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { int32_t callcnt = 0; - stripe_local_t *local = frame->local; + stripe_local_t *local = NULL; + call_frame_t *prev = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; LOCK (&frame->lock); { callcnt = --local->call_count; if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_DEBUG, "%s returned error %s", - ((call_frame_t *)cookie)->this->name, - strerror (op_errno)); + prev->this->name, strerror (op_errno)); local->op_ret = -1; - local->failed = 1; local->op_errno = op_errno; } - - if (op_ret >= 0) + + if (op_ret >= 0) local->op_ret = op_ret; } UNLOCK (&frame->lock); if (!callcnt) { - STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->fd); + STRIPE_STACK_UNWIND (opendir, frame, local->op_ret, + local->op_errno, local->fd, NULL); } - +out: return 0; } -/** - * stripe_opendir - - */ int32_t -stripe_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) +stripe_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, dict_t *xdata) { xlator_list_t *trav = NULL; stripe_local_t *local = NULL; stripe_private_t *priv = NULL; - int32_t op_errno = 1; + int32_t op_errno = EINVAL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -2170,118 +2588,61 @@ stripe_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) } /* Initialization */ - local = CALLOC (1, sizeof (stripe_local_t)); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } frame->local = local; - local->inode = loc->inode; - local->fd = fd; local->call_count = priv->child_count; + local->fd = fd_ref (fd); while (trav) { STACK_WIND (frame, stripe_opendir_cbk, trav->xlator, - trav->xlator->fops->opendir, loc, fd); + trav->xlator->fops->opendir, loc, fd, NULL); trav = trav->next; } - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL); - return 0; -} - -/** - * stripe_getxattr_cbk - - */ -int32_t -stripe_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *value) -{ - STACK_UNWIND (frame, op_ret, op_errno, value); return 0; -} - - -/** - * stripe_getxattr - - */ -int32_t -stripe_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name) -{ - int32_t op_errno = 1; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (loc->inode, err); - - STACK_WIND (frame, stripe_getxattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->getxattr, loc, name); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL); +err: + STRIPE_STACK_UNWIND (opendir, frame, -1, op_errno, NULL, NULL); return 0; } -/** - * stripe_removexattr - - */ -int32_t -stripe_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name) -{ - int32_t op_errno = 1; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (loc->inode, err); - - STACK_WIND (frame, stripe_common_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->removexattr, loc, name); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno); - return 0; -} - - -/** - * stripe_lk_cbk - - */ int32_t stripe_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct flock *lock) + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; + call_frame_t *prev = NULL; + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; local = frame->local; LOCK (&frame->lock); { callcnt = --local->call_count; if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_DEBUG, "%s returned error %s", - ((call_frame_t *)cookie)->this->name, - strerror (op_errno)); + prev->this->name, strerror (op_errno)); local->op_errno = op_errno; - if (op_errno == ENOTCONN) + if ((op_errno != ENOENT) || + (prev->this == FIRST_CHILD (this))) local->failed = 1; } - if (op_ret == 0 && local->op_ret == -1) { - /* First successful call, copy the *lock */ - local->op_ret = 0; - local->lock = *lock; + if (op_ret >= 0) { + if (FIRST_CHILD(this) == prev->this) { + /* First successful call, copy the *lock */ + local->op_ret = op_ret; + local->lock = *lock; + } } } UNLOCK (&frame->lock); @@ -2289,24 +2650,21 @@ stripe_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (!callcnt) { if (local->failed) local->op_ret = -1; - STACK_UNWIND (frame, local->op_ret, - local->op_errno, &local->lock); + STRIPE_STACK_UNWIND (lk, frame, local->op_ret, + local->op_errno, &local->lock, NULL); } +out: return 0; } - -/** - * stripe_lk - - */ int32_t stripe_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, - struct flock *lock) + struct gf_flock *lock, dict_t *xdata) { stripe_local_t *local = NULL; xlator_list_t *trav = NULL; stripe_private_t *priv = NULL; - int32_t op_errno = 1; + int32_t op_errno = EINVAL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -2317,77 +2675,75 @@ stripe_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, priv = this->private; /* Initialization */ - local = CALLOC (1, sizeof (stripe_local_t)); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } local->op_ret = -1; frame->local = local; - local->call_count = priv->child_count; - + while (trav) { STACK_WIND (frame, stripe_lk_cbk, trav->xlator, - trav->xlator->fops->lk, fd, cmd, lock); + trav->xlator->fops->lk, fd, cmd, lock, NULL); trav = trav->next; } return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL); +err: + STRIPE_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); return 0; } -/** - * stripe_writedir - - */ + int32_t -stripe_setdents (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t flags, dir_entry_t *entries, int32_t count) +stripe_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - stripe_local_t *local = NULL; - stripe_private_t *priv = NULL; - xlator_list_t *trav = NULL; - int32_t op_errno = 1; + int32_t callcnt = 0; + stripe_local_t *local = NULL; + call_frame_t *prev = NULL; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } - priv = this->private; - trav = this->children; + prev = cookie; + local = frame->local; - /* Initialization */ - local = CALLOC (1, sizeof (stripe_local_t)); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - frame->local = local; - local->call_count = priv->child_count; + LOCK (&frame->lock); + { + callcnt = --local->call_count; - while (trav) { - STACK_WIND (frame, stripe_stack_unwind_cbk, trav->xlator, - trav->xlator->fops->setdents, fd, flags, entries, - count); - trav = trav->next; + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "%s returned %s", + prev->this->name, strerror (op_errno)); + local->op_errno = op_errno; + if ((op_errno != ENOENT) || + (prev->this == FIRST_CHILD (this))) + local->failed = 1; + } + if (op_ret >= 0) + local->op_ret = op_ret; } + UNLOCK (&frame->lock); - return 0; - err: - STACK_UNWIND (frame, -1, op_errno); + if (!callcnt) { + if (local->failed) + local->op_ret = -1; + + STRIPE_STACK_UNWIND (flush, frame, local->op_ret, + local->op_errno, NULL); + } +out: return 0; } - -/** - * stripe_flush - - */ int32_t -stripe_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +stripe_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { stripe_local_t *local = NULL; stripe_private_t *priv = NULL; @@ -2407,7 +2763,7 @@ stripe_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) goto err; } /* Initialization */ - local = CALLOC (1, sizeof (stripe_local_t)); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -2415,73 +2771,98 @@ stripe_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) local->op_ret = -1; frame->local = local; local->call_count = priv->child_count; - + while (trav) { - STACK_WIND (frame, stripe_stack_unwind_cbk, trav->xlator, - trav->xlator->fops->flush, fd); + STACK_WIND (frame, stripe_flush_cbk, trav->xlator, + trav->xlator->fops->flush, fd, NULL); trav = trav->next; } return 0; - err: - STACK_UNWIND (frame, -1, op_errno); +err: + STRIPE_STACK_UNWIND (flush, frame, -1, op_errno, NULL); return 0; } -/** - * stripe_fsync - - */ + int32_t -stripe_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags) +stripe_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - stripe_local_t *local = NULL; - stripe_private_t *priv = NULL; - xlator_list_t *trav = NULL; - int32_t op_errno = 1; + int32_t callcnt = 0; + stripe_local_t *local = NULL; + call_frame_t *prev = NULL; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } - priv = this->private; - trav = this->children; + prev = cookie; + local = frame->local; - /* Initialization */ - local = CALLOC (1, sizeof (stripe_local_t)); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - frame->local = local; - local->call_count = priv->child_count; - - while (trav) { - STACK_WIND (frame, stripe_stack_unwind_cbk, trav->xlator, - trav->xlator->fops->fsync, fd, flags); - trav = trav->next; + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "%s returned %s", + prev->this->name, strerror (op_errno)); + local->op_errno = op_errno; + if ((op_errno != ENOENT) || + (prev->this == FIRST_CHILD (this))) + local->failed = 1; + } + if (op_ret >= 0) { + local->op_ret = op_ret; + if (FIRST_CHILD(this) == prev->this) { + local->pre_buf = *prebuf; + local->post_buf = *postbuf; + } + local->prebuf_blocks += prebuf->ia_blocks; + local->postbuf_blocks += postbuf->ia_blocks; + + correct_file_size(prebuf, local->fctx, prev); + correct_file_size(postbuf, local->fctx, prev); + + if (local->prebuf_size < prebuf->ia_size) + local->prebuf_size = prebuf->ia_size; + + if (local->postbuf_size < postbuf->ia_size) + local->postbuf_size = postbuf->ia_size; + } } + UNLOCK (&frame->lock); - return 0; - err: - STACK_UNWIND (frame, -1, op_errno); + if (!callcnt) { + if (local->failed) + local->op_ret = -1; + + if (local->op_ret != -1) { + local->pre_buf.ia_blocks = local->prebuf_blocks; + local->pre_buf.ia_size = local->prebuf_size; + local->post_buf.ia_blocks = local->postbuf_blocks; + local->post_buf.ia_size = local->postbuf_size; + } + + STRIPE_STACK_UNWIND (fsync, frame, local->op_ret, + local->op_errno, &local->pre_buf, + &local->post_buf, NULL); + } +out: return 0; } - -/** - * stripe_fstat - - */ int32_t -stripe_fstat (call_frame_t *frame, - xlator_t *this, - fd_t *fd) +stripe_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, dict_t *xdata) { stripe_local_t *local = NULL; stripe_private_t *priv = NULL; xlator_list_t *trav = NULL; + stripe_fd_ctx_t *fctx = NULL; int32_t op_errno = 1; VALIDATE_OR_GOTO (frame, err); @@ -2493,82 +2874,107 @@ stripe_fstat (call_frame_t *frame, trav = this->children; /* Initialization */ - local = CALLOC (1, sizeof (stripe_local_t)); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } - local->op_ret = -1; + frame->local = local; - local->inode = fd->inode; + + inode_ctx_get(fd->inode, this, (uint64_t *) &fctx); + if (!fctx) { + op_errno = EINVAL; + goto err; + } + local->fctx = fctx; + local->op_ret = -1; local->call_count = priv->child_count; - + while (trav) { - STACK_WIND (frame, stripe_stack_unwind_buf_cbk, trav->xlator, - trav->xlator->fops->fstat, fd); + STACK_WIND (frame, stripe_fsync_cbk, trav->xlator, + trav->xlator->fops->fsync, fd, flags, NULL); trav = trav->next; } return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL); +err: + STRIPE_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); return 0; } - -/** - * stripe_fchmod - - */ -int32_t -stripe_fchmod (call_frame_t *frame, xlator_t *this, fd_t *fd, mode_t mode) +int32_t +stripe_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) { - stripe_local_t *local = NULL; - stripe_private_t *priv = NULL; - xlator_list_t *trav = NULL; - int32_t op_errno = 1; + int32_t callcnt = 0; + stripe_local_t *local = NULL; + call_frame_t *prev = NULL; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } - priv = this->private; - trav = this->children; + prev = cookie; + local = frame->local; - /* Initialization */ - local = CALLOC (1, sizeof (stripe_local_t)); - if (!local) { - op_errno = ENOMEM; - goto err; + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "%s returned error %s", + prev->this->name, strerror (op_errno)); + local->op_errno = op_errno; + if ((op_errno != ENOENT) || + (prev->this == FIRST_CHILD (this))) + local->failed = 1; + } + + if (op_ret == 0) { + local->op_ret = 0; + + if (FIRST_CHILD(this) == prev->this) + local->stbuf = *buf; + + local->stbuf_blocks += buf->ia_blocks; + + correct_file_size(buf, local->fctx, prev); + + if (local->stbuf_size < buf->ia_size) + local->stbuf_size = buf->ia_size; + } } - local->op_ret = -1; - frame->local = local; - local->inode = fd->inode; - local->call_count = priv->child_count; - - while (trav) { - STACK_WIND (frame, stripe_stack_unwind_buf_cbk, trav->xlator, - trav->xlator->fops->fchmod, fd, mode); - trav = trav->next; + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->failed) + local->op_ret = -1; + + if (local->op_ret != -1) { + local->stbuf.ia_size = local->stbuf_size; + local->stbuf.ia_blocks = local->stbuf_blocks; + } + + STRIPE_STACK_UNWIND (fstat, frame, local->op_ret, + local->op_errno, &local->stbuf, NULL); } - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL); +out: return 0; } - -/** - * stripe_fchown - - */ -int32_t -stripe_fchown (call_frame_t *frame, xlator_t *this, fd_t *fd, uid_t uid, - gid_t gid) +int32_t +stripe_fstat (call_frame_t *frame, + xlator_t *this, + fd_t *fd, dict_t *xdata) { stripe_local_t *local = NULL; stripe_private_t *priv = NULL; xlator_list_t *trav = NULL; + stripe_fd_ctx_t *fctx = NULL; int32_t op_errno = 1; VALIDATE_OR_GOTO (frame, err); @@ -2580,39 +2986,44 @@ stripe_fchown (call_frame_t *frame, xlator_t *this, fd_t *fd, uid_t uid, trav = this->children; /* Initialization */ - local = CALLOC (1, sizeof (stripe_local_t)); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } local->op_ret = -1; frame->local = local; - local->inode = fd->inode; local->call_count = priv->child_count; - + + if (IA_ISREG(fd->inode->ia_type)) { + inode_ctx_get(fd->inode, this, (uint64_t *) &fctx); + if (!fctx) + goto err; + local->fctx = fctx; + } + while (trav) { - STACK_WIND (frame, stripe_stack_unwind_buf_cbk, trav->xlator, - trav->xlator->fops->fchown, fd, uid, gid); + STACK_WIND (frame, stripe_fstat_cbk, trav->xlator, + trav->xlator->fops->fstat, fd, NULL); trav = trav->next; } return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL); +err: + STRIPE_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL); return 0; } -/** - * stripe_ftruncate - - */ int32_t -stripe_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) +stripe_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, dict_t *xdata) { stripe_local_t *local = NULL; stripe_private_t *priv = NULL; - xlator_list_t *trav = NULL; - int32_t op_errno = 1; + stripe_fd_ctx_t *fctx = NULL; + int i, eof_idx; + off_t dest_offset, tmp_offset; + int32_t op_errno = 1; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -2620,37 +3031,115 @@ stripe_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) VALIDATE_OR_GOTO (fd->inode, err); priv = this->private; - trav = this->children; /* Initialization */ - local = CALLOC (1, sizeof (stripe_local_t)); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } local->op_ret = -1; frame->local = local; - local->inode = fd->inode; local->call_count = priv->child_count; - - while (trav) { - STACK_WIND (frame, stripe_stack_unwind_buf_cbk, trav->xlator, - trav->xlator->fops->ftruncate, fd, offset); - trav = trav->next; - } + + inode_ctx_get(fd->inode, this, (uint64_t *) &fctx); + if (!fctx) { + gf_log(this->name, GF_LOG_ERROR, "no stripe context"); + op_errno = EINVAL; + goto err; + } + if (!fctx->stripe_count) { + gf_log(this->name, GF_LOG_ERROR, "no stripe count"); + op_errno = EINVAL; + goto err; + } + + local->fctx = fctx; + eof_idx = (offset / fctx->stripe_size) % fctx->stripe_count; + + for (i = 0; i < fctx->stripe_count; i++) { + if (!fctx->xl_array[i]) { + gf_log(this->name, GF_LOG_ERROR, "no xlator at index " + "%d", i); + op_errno = EINVAL; + goto err; + } + + if (fctx->stripe_coalesce) { + if (i < eof_idx) + tmp_offset = roof(offset, fctx->stripe_size * + fctx->stripe_count); + else if (i > eof_idx) + tmp_offset = floor(offset, fctx->stripe_size * + fctx->stripe_count); + else + tmp_offset = offset; + + dest_offset = coalesced_offset(tmp_offset, + fctx->stripe_size, fctx->stripe_count); + } else { + dest_offset = offset; + } + + STACK_WIND(frame, stripe_truncate_cbk, fctx->xl_array[i], + fctx->xl_array[i]->fops->ftruncate, fd, dest_offset, + NULL); + } return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL); +err: + STRIPE_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); return 0; } -/** - * stripe_fsyncdir - - */ int32_t -stripe_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags) +stripe_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + int32_t callcnt = 0; + stripe_local_t *local = NULL; + call_frame_t *prev = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "%s returned %s", + prev->this->name, strerror (op_errno)); + local->op_errno = op_errno; + if ((op_errno != ENOENT) || + (prev->this == FIRST_CHILD (this))) + local->failed = 1; + } + if (op_ret >= 0) + local->op_ret = op_ret; + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->failed) + local->op_ret = -1; + + STRIPE_STACK_UNWIND (fsyncdir, frame, local->op_ret, + local->op_errno, NULL); + } +out: + return 0; +} + +int32_t +stripe_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, dict_t *xdata) { stripe_local_t *local = NULL; stripe_private_t *priv = NULL; @@ -2666,7 +3155,7 @@ stripe_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags) trav = this->children; /* Initialization */ - local = CALLOC (1, sizeof (stripe_local_t)); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -2676,29 +3165,114 @@ stripe_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags) local->call_count = priv->child_count; while (trav) { - STACK_WIND (frame, stripe_stack_unwind_cbk, trav->xlator, - trav->xlator->fops->fsyncdir, fd, flags); + STACK_WIND (frame, stripe_fsyncdir_cbk, trav->xlator, + trav->xlator->fops->fsyncdir, fd, flags, NULL); trav = trav->next; } return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL); +err: + STRIPE_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); return 0; } -/** - * stripe_single_readv_cbk - This function is used as return fn, when the - * file name doesn't match the pattern specified for striping. - */ int32_t -stripe_single_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iovec *vector, int32_t count, - struct stat *stbuf, struct iobref *iobref) +stripe_readv_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf, iobref); + int32_t i = 0; + int32_t callcnt = 0; + int32_t count = 0; + stripe_local_t *local = NULL; + struct iovec *vec = NULL; + struct iatt tmp_stbuf = {0,}; + struct iobref *tmp_iobref = NULL; + struct iobuf *iobuf = NULL; + call_frame_t *prev = NULL; + + if (!this || !frame || !frame->local) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + local = frame->local; + prev = cookie; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret != -1) { + correct_file_size(buf, local->fctx, prev); + if (local->stbuf_size < buf->ia_size) + local->stbuf_size = buf->ia_size; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + op_ret = 0; + + /* Keep extra space for filling in '\0's */ + vec = GF_CALLOC ((local->count * 2), sizeof (struct iovec), + gf_stripe_mt_iovec); + if (!vec) { + op_ret = -1; + goto done; + } + + for (i = 0; i < local->wind_count; i++) { + if (local->replies[i].op_ret) { + memcpy ((vec + count), local->replies[i].vector, + (local->replies[i].count * sizeof (struct iovec))); + count += local->replies[i].count; + op_ret += local->replies[i].op_ret; + } + if ((local->replies[i].op_ret < + local->replies[i].requested_size) && + (local->stbuf_size > (local->offset + op_ret))) { + /* Fill in 0s here */ + vec[count].iov_len = + (local->replies[i].requested_size - + local->replies[i].op_ret); + iobuf = iobuf_get2 (this->ctx->iobuf_pool, + vec[count].iov_len); + if (!iobuf) { + gf_log (this->name, GF_LOG_ERROR, + "Out of memory."); + op_ret = -1; + op_errno = ENOMEM; + goto done; + } + memset (iobuf->ptr, 0, vec[count].iov_len); + vec[count].iov_base = iobuf->ptr; + + iobref_add (local->iobref, iobuf); + iobuf_unref(iobuf); + + op_ret += vec[count].iov_len; + count++; + } + GF_FREE (local->replies[i].vector); + } + + /* FIXME: notice that st_ino, and st_dev (gen) will be + * different than what inode will have. Make sure this doesn't + * cause any bugs at higher levels */ + memcpy (&tmp_stbuf, &local->replies[0].stbuf, + sizeof (struct iatt)); + tmp_stbuf.ia_size = local->stbuf_size; + + done: + GF_FREE (local->replies); + tmp_iobref = local->iobref; + STRIPE_STACK_UNWIND (readv, frame, op_ret, op_errno, vec, + count, &tmp_stbuf, tmp_iobref, NULL); + + iobref_unref (tmp_iobref); + GF_FREE (vec); + } +out: return 0; } @@ -2707,116 +3281,155 @@ stripe_single_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, * to above layer after putting it in a single vector. */ int32_t -stripe_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +stripe_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct stat *stbuf, struct iobref *iobref) + int32_t count, struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) { int32_t index = 0; int32_t callcnt = 0; - call_frame_t *main_frame = NULL; - stripe_local_t *main_local = NULL; - stripe_local_t *local = frame->local; + int32_t final_count = 0; + int32_t need_to_check_proper_size = 0; + call_frame_t *mframe = NULL; + stripe_local_t *mlocal = NULL; + stripe_local_t *local = NULL; + struct iovec *final_vec = NULL; + struct iatt tmp_stbuf = {0,}; + struct iatt *tmp_stbuf_p = NULL; //need it for a warning + struct iobref *tmp_iobref = NULL; + stripe_fd_ctx_t *fctx = NULL; + call_frame_t *prev = NULL; - index = local->node_index; - main_frame = local->orig_frame; - main_local = main_frame->local; + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto end; + } + + local = frame->local; + index = local->node_index; + prev = cookie; + mframe = local->orig_frame; + if (!mframe) + goto out; + + mlocal = mframe->local; + if (!mlocal) + goto out; + + fctx = mlocal->fctx; - LOCK (&main_frame->lock); + LOCK (&mframe->lock); { - main_local->replies[index].op_ret = op_ret; - main_local->replies[index].op_errno = op_errno; + mlocal->replies[index].op_ret = op_ret; + mlocal->replies[index].op_errno = op_errno; + mlocal->replies[index].requested_size = local->readv_size; if (op_ret >= 0) { - main_local->replies[index].stbuf = *stbuf; - main_local->replies[index].count = count; - main_local->replies[index].vector = - iov_dup (vector, count); + mlocal->replies[index].stbuf = *stbuf; + mlocal->replies[index].count = count; + mlocal->replies[index].vector = iov_dup (vector, count); - if (!main_local->iobref) - main_local->iobref = iobref_new (); - iobref_merge (main_local->iobref, iobref); + correct_file_size(stbuf, fctx, prev); + + if (local->stbuf_size < stbuf->ia_size) + local->stbuf_size = stbuf->ia_size; + local->stbuf_blocks += stbuf->ia_blocks; + + if (!mlocal->iobref) + mlocal->iobref = iobref_new (); + iobref_merge (mlocal->iobref, iobref); } - callcnt = ++main_local->call_count; + callcnt = ++mlocal->call_count; } - UNLOCK(&main_frame->lock); - - if (callcnt == main_local->wind_count) { - int32_t final_count = 0; - struct iovec *final_vec = NULL; - struct stat tmp_stbuf = {0,}; - struct iobref *iobref = NULL; + UNLOCK(&mframe->lock); + if (callcnt == mlocal->wind_count) { op_ret = 0; - memcpy (&tmp_stbuf, &main_local->replies[0].stbuf, - sizeof (struct stat)); - for (index=0; index < main_local->wind_count; index++) { - /* TODO: check whether each stripe returned 'expected' - * number of bytes - */ - if (main_local->replies[index].op_ret == -1) { + + for (index=0; index < mlocal->wind_count; index++) { + /* check whether each stripe returned + * 'expected' number of bytes */ + if (mlocal->replies[index].op_ret == -1) { op_ret = -1; - op_errno = main_local->replies[index].op_errno; + op_errno = mlocal->replies[index].op_errno; break; } - op_ret += main_local->replies[index].op_ret; - final_count += main_local->replies[index].count; - /* TODO: Do I need to send anything more in stbuf? */ - if (tmp_stbuf.st_size < - main_local->replies[index].stbuf.st_size) { - tmp_stbuf.st_size = - main_local->replies[index].stbuf.st_size; + /* TODO: handle the 'holes' within the read range + properly */ + if (mlocal->replies[index].op_ret < + mlocal->replies[index].requested_size) { + need_to_check_proper_size = 1; } + + op_ret += mlocal->replies[index].op_ret; + mlocal->count += mlocal->replies[index].count; } - if (op_ret != -1) { - final_vec = CALLOC (final_count, - sizeof (struct iovec)); - if (!final_vec) { - op_ret = -1; - final_count = 0; - goto done; - } + if (op_ret == -1) + goto done; + if (need_to_check_proper_size) + goto check_size; - final_count = 0; + final_vec = GF_CALLOC (mlocal->count, sizeof (struct iovec), + gf_stripe_mt_iovec); - for (index=0; - index < main_local->wind_count; index++) { - memcpy (final_vec + final_count, - main_local->replies[index].vector, - (main_local->replies[index].count * - sizeof (struct iovec))); - final_count += - main_local->replies[index].count; + if (!final_vec) { + op_ret = -1; + goto done; + } - free (main_local->replies[index].vector); - } - } else { - final_vec = NULL; - final_count = 0; + for (index = 0; index < mlocal->wind_count; index++) { + memcpy ((final_vec + final_count), + mlocal->replies[index].vector, + (mlocal->replies[index].count * + sizeof (struct iovec))); + final_count += mlocal->replies[index].count; + GF_FREE (mlocal->replies[index].vector); } + /* FIXME: notice that st_ino, and st_dev (gen) will be + * different than what inode will have. Make sure this doesn't + * cause any bugs at higher levels */ + memcpy (&tmp_stbuf, &mlocal->replies[0].stbuf, + sizeof (struct iatt)); + tmp_stbuf.ia_size = local->stbuf_size; + tmp_stbuf.ia_blocks = local->stbuf_blocks; + done: /* */ - FREE (main_local->replies); - iobref = main_local->iobref; - STACK_UNWIND (main_frame, op_ret, op_errno, - final_vec, final_count, &tmp_stbuf, iobref); + GF_FREE (mlocal->replies); + tmp_iobref = mlocal->iobref; + /* work around for nfs truncated read. Bug 3774 */ + tmp_stbuf_p = &tmp_stbuf; + WIPE (tmp_stbuf_p); + STRIPE_STACK_UNWIND (readv, mframe, op_ret, op_errno, final_vec, + final_count, &tmp_stbuf, tmp_iobref, NULL); + + iobref_unref (tmp_iobref); + GF_FREE (final_vec); + } + + goto out; + +check_size: + mlocal->call_count = fctx->stripe_count; - iobref_unref (iobref); - if (final_vec) - FREE (final_vec); + for (index = 0; index < fctx->stripe_count; index++) { + STACK_WIND (mframe, stripe_readv_fstat_cbk, + (fctx->xl_array[index]), + (fctx->xl_array[index])->fops->fstat, + mlocal->fd, NULL); } - STACK_DESTROY (frame->root); +out: + STRIPE_STACK_DESTROY (frame); +end: return 0; } -/** - * stripe_readv - - */ + int32_t stripe_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, - size_t size, off_t offset) + size_t size, off_t offset, uint32_t flags, dict_t *xdata) { - int32_t op_errno = 1; + int32_t op_errno = EINVAL; int32_t idx = 0; int32_t index = 0; int32_t num_stripe = 0; @@ -2827,17 +3440,18 @@ stripe_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, uint64_t stripe_size = 0; off_t rounded_start = 0; off_t frame_offset = offset; + off_t dest_offset = 0; stripe_local_t *local = NULL; call_frame_t *rframe = NULL; stripe_local_t *rlocal = NULL; - xlator_list_t *trav = NULL; - stripe_private_t *priv = NULL; stripe_fd_ctx_t *fctx = NULL; - trav = this->children; - priv = this->private; + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); - fd_ctx_get (fd, this, &tmp_fctx); + inode_ctx_get (fd->inode, this, &tmp_fctx); if (!tmp_fctx) { op_errno = EBADFD; goto err; @@ -2845,122 +3459,183 @@ stripe_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; stripe_size = fctx->stripe_size; - /* The file is stripe across the child nodes. Send the read request - * to the child nodes appropriately after checking which region of + STRIPE_VALIDATE_FCTX (fctx, err); + + if (!stripe_size) { + gf_log (this->name, GF_LOG_DEBUG, + "Wrong stripe size for the file"); + goto err; + } + /* The file is stripe across the child nodes. Send the read request + * to the child nodes appropriately after checking which region of * the file is in which child node. Always '0-<stripe_size>' part of * the file resides in the first child. */ rounded_start = floor (offset, stripe_size); rounded_end = roof (offset+size, stripe_size); - num_stripe = (rounded_end - rounded_start) / stripe_size; - - local = CALLOC (1, sizeof (stripe_local_t)); + num_stripe = (rounded_end- rounded_start)/stripe_size; + + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } - local->wind_count = num_stripe; frame->local = local; - + /* This is where all the vectors should be copied. */ - local->replies = CALLOC (num_stripe, sizeof (struct readv_replies)); + local->replies = GF_CALLOC (num_stripe, sizeof (struct stripe_replies), + gf_stripe_mt_stripe_replies); if (!local->replies) { op_errno = ENOMEM; goto err; } - + off_index = (offset / stripe_size) % fctx->stripe_count; - + local->wind_count = num_stripe; + local->readv_size = size; + local->offset = offset; + local->fd = fd_ref (fd); + local->fctx = fctx; + for (index = off_index; index < (num_stripe + off_index); index++) { rframe = copy_frame (frame); - rlocal = CALLOC (1, sizeof (stripe_local_t)); + rlocal = mem_get0 (this->local_pool); if (!rlocal) { op_errno = ENOMEM; goto err; } - + frame_size = min (roof (frame_offset+1, stripe_size), (offset + size)) - frame_offset; - + rlocal->node_index = index - off_index; rlocal->orig_frame = frame; + rlocal->readv_size = frame_size; rframe->local = rlocal; idx = (index % fctx->stripe_count); + + if (fctx->stripe_coalesce) + dest_offset = coalesced_offset(frame_offset, + stripe_size, fctx->stripe_count); + else + dest_offset = frame_offset; + STACK_WIND (rframe, stripe_readv_cbk, fctx->xl_array[idx], fctx->xl_array[idx]->fops->readv, - fd, frame_size, frame_offset); - + fd, frame_size, dest_offset, flags, xdata); + frame_offset += frame_size; } return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL); +err: + if (rframe) + STRIPE_STACK_DESTROY (rframe); + + STRIPE_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL); return 0; } -/** - * stripe_writev_cbk - - */ int32_t stripe_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *stbuf) + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; + stripe_local_t *mlocal = NULL; + call_frame_t *prev = NULL; + call_frame_t *mframe = NULL; + struct stripe_replies *reply = NULL; + int32_t i = 0; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + prev = cookie; local = frame->local; + mframe = local->orig_frame; + mlocal = mframe->local; LOCK(&frame->lock); { - callcnt = ++local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - ((call_frame_t *)cookie)->this->name, - strerror (op_errno)); - local->op_errno = op_errno; - local->op_ret = -1; - } + callcnt = ++mlocal->call_count; + + mlocal->replies[local->node_index].op_ret = op_ret; + mlocal->replies[local->node_index].op_errno = op_errno; + if (op_ret >= 0) { - local->op_ret += op_ret; - local->stbuf = *stbuf; + mlocal->post_buf = *postbuf; + mlocal->pre_buf = *prebuf; + + mlocal->prebuf_blocks += prebuf->ia_blocks; + mlocal->postbuf_blocks += postbuf->ia_blocks; + + correct_file_size(prebuf, mlocal->fctx, prev); + correct_file_size(postbuf, mlocal->fctx, prev); + + if (mlocal->prebuf_size < prebuf->ia_size) + mlocal->prebuf_size = prebuf->ia_size; + if (mlocal->postbuf_size < postbuf->ia_size) + mlocal->postbuf_size = postbuf->ia_size; } } UNLOCK (&frame->lock); - if ((callcnt == local->wind_count) && local->unwind) { - STACK_UNWIND (frame, local->op_ret, - local->op_errno, &local->stbuf); + if ((callcnt == mlocal->wind_count) && mlocal->unwind) { + mlocal->pre_buf.ia_size = mlocal->prebuf_size; + mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks; + mlocal->post_buf.ia_size = mlocal->postbuf_size; + mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks; + + /* + * Only return the number of consecutively written bytes up until + * the first error. Only return an error if it occurs first. + * + * When a short write occurs, the application should retry at the + * appropriate offset, at which point we'll potentially pass back + * the error. + */ + for (i = 0, reply = mlocal->replies; i < mlocal->wind_count; + i++, reply++) { + if (reply->op_ret == -1) { + gf_log(this->name, GF_LOG_DEBUG, "reply %d " + "returned error %s", i, + strerror(reply->op_errno)); + if (!mlocal->op_ret) { + mlocal->op_ret = -1; + mlocal->op_errno = reply->op_errno; + } + break; + } + + mlocal->op_ret += reply->op_ret; + + if (reply->op_ret < reply->requested_size) + break; + } + + GF_FREE(mlocal->replies); + + STRIPE_STACK_UNWIND (writev, mframe, mlocal->op_ret, + mlocal->op_errno, &mlocal->pre_buf, + &mlocal->post_buf, NULL); } +out: + STRIPE_STACK_DESTROY(frame); return 0; } - -/** - * stripe_single_writev_cbk - - */ -int32_t -stripe_single_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *stbuf) -{ - STACK_UNWIND (frame, op_ret, op_errno, stbuf); - return 0; -} -/** - * stripe_writev - - */ int32_t stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, int32_t count, off_t offset, - struct iobref *iobref) + uint32_t flags, struct iobref *iobref, dict_t *xdata) { - struct iovec *tmp_vec = vector; - stripe_private_t *priv = NULL; + struct iovec *tmp_vec = NULL; stripe_local_t *local = NULL; - xlator_list_t *trav = NULL; stripe_fd_ctx_t *fctx = NULL; int32_t op_errno = 1; int32_t idx = 0; @@ -2971,10 +3646,19 @@ stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t fill_size = 0; uint64_t stripe_size = 0; uint64_t tmp_fctx = 0; + off_t dest_offset = 0; + off_t rounded_start = 0; + off_t rounded_end = 0; + int32_t total_chunks = 0; + call_frame_t *wframe = NULL; + stripe_local_t *wlocal = NULL; - priv = this->private; + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); - fd_ctx_get (fd, this, &tmp_fctx); + inode_ctx_get (fd->inode, this, &tmp_fctx); if (!tmp_fctx) { op_errno = EINVAL; goto err; @@ -2982,29 +3666,57 @@ stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; stripe_size = fctx->stripe_size; + STRIPE_VALIDATE_FCTX (fctx, err); + /* File has to be stripped across the child nodes */ for (idx = 0; idx< count; idx ++) { - total_size += tmp_vec[idx].iov_len; + total_size += vector[idx].iov_len; } remaining_size = total_size; - local = CALLOC (1, sizeof (stripe_local_t)); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } frame->local = local; local->stripe_size = stripe_size; + local->fctx = fctx; + + if (!stripe_size) { + gf_log (this->name, GF_LOG_DEBUG, + "Wrong stripe size for the file"); + op_errno = EINVAL; + goto err; + } + rounded_start = floor(offset, stripe_size); + rounded_end = roof(offset + total_size, stripe_size); + total_chunks = (rounded_end - rounded_start) / stripe_size; + local->replies = GF_CALLOC(total_chunks, sizeof(struct stripe_replies), + gf_stripe_mt_stripe_replies); + if (!local->replies) { + op_errno = ENOMEM; + goto err; + } + + total_chunks = 0; while (1) { - /* Send striped chunk of the vector to child + wframe = copy_frame(frame); + wlocal = mem_get0(this->local_pool); + if (!wlocal) { + op_errno = ENOMEM; + goto err; + } + wlocal->orig_frame = frame; + wframe->local = wlocal; + + /* Send striped chunk of the vector to child nodes appropriately. */ - trav = this->children; - - idx = (((offset + offset_offset) / + idx = (((offset + offset_offset) / local->stripe_size) % fctx->stripe_count); - fill_size = (local->stripe_size - + fill_size = (local->stripe_size - ((offset + offset_offset) % local->stripe_size)); if (fill_size > remaining_size) fill_size = remaining_size; @@ -3013,158 +3725,597 @@ stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, tmp_count = iov_subset (vector, count, offset_offset, offset_offset + fill_size, NULL); - tmp_vec = CALLOC (tmp_count, sizeof (struct iovec)); + tmp_vec = GF_CALLOC (tmp_count, sizeof (struct iovec), + gf_stripe_mt_iovec); if (!tmp_vec) { op_errno = ENOMEM; goto err; } tmp_count = iov_subset (vector, count, offset_offset, offset_offset + fill_size, tmp_vec); - + local->wind_count++; if (remaining_size == 0) local->unwind = 1; - STACK_WIND(frame, stripe_writev_cbk, fctx->xl_array[idx], - fctx->xl_array[idx]->fops->writev, fd, tmp_vec, - tmp_count, offset + offset_offset, iobref); - FREE (tmp_vec); + /* + * Store off the request index (with respect to the chunk of the + * initial offset) and the size of the request. This is required + * in the callback to calculate an appropriate return value in + * the event of a write failure in one or more requests. + */ + wlocal->node_index = total_chunks; + local->replies[total_chunks].requested_size = fill_size; + + dest_offset = offset + offset_offset; + if (fctx->stripe_coalesce) + dest_offset = coalesced_offset(dest_offset, + local->stripe_size, fctx->stripe_count); + + STACK_WIND (wframe, stripe_writev_cbk, fctx->xl_array[idx], + fctx->xl_array[idx]->fops->writev, fd, tmp_vec, + tmp_count, dest_offset, flags, iobref, + xdata); + + GF_FREE (tmp_vec); offset_offset += fill_size; + total_chunks++; if (remaining_size == 0) break; } return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL); +err: + if (wframe) + STRIPE_STACK_DESTROY(wframe); + + STRIPE_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); return 0; } +int32_t +stripe_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t callcnt = 0; + stripe_local_t *local = NULL; + stripe_local_t *mlocal = NULL; + call_frame_t *prev = NULL; + call_frame_t *mframe = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } -/* Management operations */ + prev = cookie; + local = frame->local; + mframe = local->orig_frame; + mlocal = mframe->local; + + LOCK(&frame->lock); + { + callcnt = ++mlocal->call_count; + + if (op_ret == 0) { + mlocal->post_buf = *postbuf; + mlocal->pre_buf = *prebuf; + + mlocal->prebuf_blocks += prebuf->ia_blocks; + mlocal->postbuf_blocks += postbuf->ia_blocks; + + correct_file_size(prebuf, mlocal->fctx, prev); + correct_file_size(postbuf, mlocal->fctx, prev); + + if (mlocal->prebuf_size < prebuf->ia_size) + mlocal->prebuf_size = prebuf->ia_size; + if (mlocal->postbuf_size < postbuf->ia_size) + mlocal->postbuf_size = postbuf->ia_size; + } + + /* return the first failure */ + if (mlocal->op_ret == 0) { + mlocal->op_ret = op_ret; + mlocal->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + if ((callcnt == mlocal->wind_count) && mlocal->unwind) { + mlocal->pre_buf.ia_size = mlocal->prebuf_size; + mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks; + mlocal->post_buf.ia_size = mlocal->postbuf_size; + mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks; + + STRIPE_STACK_UNWIND (fallocate, mframe, mlocal->op_ret, + mlocal->op_errno, &mlocal->pre_buf, + &mlocal->post_buf, NULL); + } +out: + STRIPE_STACK_DESTROY(frame); + return 0; +} -/** - * stripe_stats_cbk - Add all the fields received from different clients. - * Once all the clients return, send stats to above layer. - * - */ int32_t -stripe_stats_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct xlator_stats *stats) +stripe_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + stripe_local_t *local = NULL; + stripe_fd_ctx_t *fctx = NULL; + int32_t op_errno = 1; + int32_t idx = 0; + int32_t offset_offset = 0; + int32_t remaining_size = 0; + off_t fill_size = 0; + uint64_t stripe_size = 0; + uint64_t tmp_fctx = 0; + off_t dest_offset = 0; + call_frame_t *fframe = NULL; + stripe_local_t *flocal = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + inode_ctx_get (fd->inode, this, &tmp_fctx); + if (!tmp_fctx) { + op_errno = EINVAL; + goto err; + } + fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; + stripe_size = fctx->stripe_size; + + STRIPE_VALIDATE_FCTX (fctx, err); + + remaining_size = len; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + frame->local = local; + local->stripe_size = stripe_size; + local->fctx = fctx; + + if (!stripe_size) { + gf_log (this->name, GF_LOG_DEBUG, + "Wrong stripe size for the file"); + op_errno = EINVAL; + goto err; + } + + while (1) { + fframe = copy_frame(frame); + flocal = mem_get0(this->local_pool); + if (!flocal) { + op_errno = ENOMEM; + goto err; + } + flocal->orig_frame = frame; + fframe->local = flocal; + + /* send fallocate request to the associated child node */ + idx = (((offset + offset_offset) / + local->stripe_size) % fctx->stripe_count); + + fill_size = (local->stripe_size - + ((offset + offset_offset) % local->stripe_size)); + if (fill_size > remaining_size) + fill_size = remaining_size; + + remaining_size -= fill_size; + + local->wind_count++; + if (remaining_size == 0) + local->unwind = 1; + + dest_offset = offset + offset_offset; + if (fctx->stripe_coalesce) + dest_offset = coalesced_offset(dest_offset, + local->stripe_size, fctx->stripe_count); + + /* + * TODO: Create a separate handler for coalesce mode that sends a + * single fallocate per-child (since the ranges are linear). + */ + STACK_WIND(fframe, stripe_fallocate_cbk, fctx->xl_array[idx], + fctx->xl_array[idx]->fops->fallocate, fd, mode, + dest_offset, fill_size, xdata); + + offset_offset += fill_size; + if (remaining_size == 0) + break; + } + + return 0; +err: + if (fframe) + STRIPE_STACK_DESTROY(fframe); + + STRIPE_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + + +int32_t +stripe_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; + stripe_local_t *mlocal = NULL; + call_frame_t *prev = NULL; + call_frame_t *mframe = NULL; + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; local = frame->local; + mframe = local->orig_frame; + mlocal = mframe->local; LOCK(&frame->lock); { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - ((call_frame_t *)cookie)->this->name, - strerror (op_errno)); - local->op_ret = -1; - local->op_errno = op_errno; - } + callcnt = ++mlocal->call_count; + if (op_ret == 0) { - if (local->op_ret == -2) { - /* This is to make sure this is the - first time */ - local->stats = *stats; - local->op_ret = 0; - } else { - local->stats.nr_files += stats->nr_files; - local->stats.free_disk += stats->free_disk; - local->stats.disk_usage += stats->disk_usage; - local->stats.nr_clients += stats->nr_clients; - } + mlocal->post_buf = *postbuf; + mlocal->pre_buf = *prebuf; + + mlocal->prebuf_blocks += prebuf->ia_blocks; + mlocal->postbuf_blocks += postbuf->ia_blocks; + + correct_file_size(prebuf, mlocal->fctx, prev); + correct_file_size(postbuf, mlocal->fctx, prev); + + if (mlocal->prebuf_size < prebuf->ia_size) + mlocal->prebuf_size = prebuf->ia_size; + if (mlocal->postbuf_size < postbuf->ia_size) + mlocal->postbuf_size = postbuf->ia_size; } + + /* return the first failure */ + if (mlocal->op_ret == 0) { + mlocal->op_ret = op_ret; + mlocal->op_errno = op_errno; + } } UNLOCK (&frame->lock); - if (!callcnt) { - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->stats); - } + if ((callcnt == mlocal->wind_count) && mlocal->unwind) { + mlocal->pre_buf.ia_size = mlocal->prebuf_size; + mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks; + mlocal->post_buf.ia_size = mlocal->postbuf_size; + mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks; + STRIPE_STACK_UNWIND (discard, mframe, mlocal->op_ret, + mlocal->op_errno, &mlocal->pre_buf, + &mlocal->post_buf, NULL); + } +out: + STRIPE_STACK_DESTROY(frame); return 0; } -/** - * stripe_stats - - */ int32_t -stripe_stats (call_frame_t *frame, xlator_t *this, int32_t flags) +stripe_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) { stripe_local_t *local = NULL; - xlator_list_t *trav = NULL; - stripe_private_t *priv = NULL; + stripe_fd_ctx_t *fctx = NULL; int32_t op_errno = 1; + int32_t idx = 0; + int32_t offset_offset = 0; + int32_t remaining_size = 0; + off_t fill_size = 0; + uint64_t stripe_size = 0; + uint64_t tmp_fctx = 0; + off_t dest_offset = 0; + call_frame_t *fframe = NULL; + stripe_local_t *flocal = NULL; - priv = this->private; - trav = this->children; + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + inode_ctx_get (fd->inode, this, &tmp_fctx); + if (!tmp_fctx) { + op_errno = EINVAL; + goto err; + } + fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; + stripe_size = fctx->stripe_size; + + STRIPE_VALIDATE_FCTX (fctx, err); - local = CALLOC (1, sizeof (stripe_local_t)); + remaining_size = len; + + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } frame->local = local; - local->op_ret = -2; /* to be used as a flag in _cbk */ - local->call_count = priv->child_count; + local->stripe_size = stripe_size; + local->fctx = fctx; - while (trav) { - STACK_WIND (frame, stripe_stats_cbk, trav->xlator, - trav->xlator->mops->stats, flags); - trav = trav->next; + if (!stripe_size) { + gf_log (this->name, GF_LOG_DEBUG, + "Wrong stripe size for the file"); + op_errno = EINVAL; + goto err; } + + while (1) { + fframe = copy_frame(frame); + flocal = mem_get0(this->local_pool); + if (!flocal) { + op_errno = ENOMEM; + goto err; + } + flocal->orig_frame = frame; + fframe->local = flocal; + + /* send discard request to the associated child node */ + idx = (((offset + offset_offset) / + local->stripe_size) % fctx->stripe_count); + + fill_size = (local->stripe_size - + ((offset + offset_offset) % local->stripe_size)); + if (fill_size > remaining_size) + fill_size = remaining_size; + + remaining_size -= fill_size; + + local->wind_count++; + if (remaining_size == 0) + local->unwind = 1; + + dest_offset = offset + offset_offset; + if (fctx->stripe_coalesce) + dest_offset = coalesced_offset(dest_offset, + local->stripe_size, fctx->stripe_count); + + /* + * TODO: Create a separate handler for coalesce mode that sends a + * single discard per-child (since the ranges are linear). + */ + STACK_WIND(fframe, stripe_discard_cbk, fctx->xl_array[idx], + fctx->xl_array[idx]->fops->discard, fd, dest_offset, + fill_size, xdata); + + offset_offset += fill_size; + if (remaining_size == 0) + break; + } + return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL); +err: + if (fframe) + STRIPE_STACK_DESTROY(fframe); + + STRIPE_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int32_t +stripe_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t callcnt = 0; + stripe_local_t *local = NULL; + stripe_local_t *mlocal = NULL; + call_frame_t *prev = NULL; + call_frame_t *mframe = NULL; + + GF_ASSERT (frame); + + if (!this || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; + mframe = local->orig_frame; + mlocal = mframe->local; + + LOCK(&frame->lock); + { + callcnt = ++mlocal->call_count; + + if (op_ret == 0) { + mlocal->post_buf = *postbuf; + mlocal->pre_buf = *prebuf; + + mlocal->prebuf_blocks += prebuf->ia_blocks; + mlocal->postbuf_blocks += postbuf->ia_blocks; + + correct_file_size(prebuf, mlocal->fctx, prev); + correct_file_size(postbuf, mlocal->fctx, prev); + + if (mlocal->prebuf_size < prebuf->ia_size) + mlocal->prebuf_size = prebuf->ia_size; + if (mlocal->postbuf_size < postbuf->ia_size) + mlocal->postbuf_size = postbuf->ia_size; + } + + /* return the first failure */ + if (mlocal->op_ret == 0) { + mlocal->op_ret = op_ret; + mlocal->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + if ((callcnt == mlocal->wind_count) && mlocal->unwind) { + mlocal->pre_buf.ia_size = mlocal->prebuf_size; + mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks; + mlocal->post_buf.ia_size = mlocal->postbuf_size; + mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks; + + STRIPE_STACK_UNWIND (zerofill, mframe, mlocal->op_ret, + mlocal->op_errno, &mlocal->pre_buf, + &mlocal->post_buf, NULL); + } +out: + STRIPE_STACK_DESTROY(frame); + return 0; +} + +int32_t +stripe_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + stripe_local_t *local = NULL; + stripe_fd_ctx_t *fctx = NULL; + int32_t op_errno = 1; + int32_t idx = 0; + int32_t offset_offset = 0; + int32_t remaining_size = 0; + off_t fill_size = 0; + uint64_t stripe_size = 0; + uint64_t tmp_fctx = 0; + off_t dest_offset = 0; + call_frame_t *fframe = NULL; + stripe_local_t *flocal = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + inode_ctx_get (fd->inode, this, &tmp_fctx); + if (!tmp_fctx) { + op_errno = EINVAL; + goto err; + } + fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; + stripe_size = fctx->stripe_size; + + STRIPE_VALIDATE_FCTX (fctx, err); + + remaining_size = len; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + frame->local = local; + local->stripe_size = stripe_size; + local->fctx = fctx; + + if (!stripe_size) { + gf_log (this->name, GF_LOG_DEBUG, + "Wrong stripe size for the file"); + op_errno = EINVAL; + goto err; + } + + while (1) { + fframe = copy_frame(frame); + flocal = mem_get0(this->local_pool); + if (!flocal) { + op_errno = ENOMEM; + goto err; + } + flocal->orig_frame = frame; + fframe->local = flocal; + + idx = (((offset + offset_offset) / + local->stripe_size) % fctx->stripe_count); + + fill_size = (local->stripe_size - + ((offset + offset_offset) % local->stripe_size)); + if (fill_size > remaining_size) + fill_size = remaining_size; + + remaining_size -= fill_size; + + local->wind_count++; + if (remaining_size == 0) + local->unwind = 1; + + dest_offset = offset + offset_offset; + if (fctx->stripe_coalesce) + dest_offset = coalesced_offset(dest_offset, + local->stripe_size, + fctx->stripe_count); + + STACK_WIND(fframe, stripe_zerofill_cbk, fctx->xl_array[idx], + fctx->xl_array[idx]->fops->zerofill, fd, + dest_offset, fill_size, xdata); + offset_offset += fill_size; + if (remaining_size == 0) + break; + } + + return 0; +err: + if (fframe) + STRIPE_STACK_DESTROY(fframe); + + STRIPE_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL); return 0; } int32_t stripe_release (xlator_t *this, fd_t *fd) { + return 0; +} + +int +stripe_forget (xlator_t *this, inode_t *inode) +{ uint64_t tmp_fctx = 0; stripe_fd_ctx_t *fctx = NULL; - fd_ctx_del (fd, this, &tmp_fctx); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (inode, err); + + (void) inode_ctx_del (inode, this, &tmp_fctx); if (!tmp_fctx) { - goto out; + goto err; } - + fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; if (!fctx->static_array) - FREE (fctx->xl_array); - - FREE (fctx); - - out: - return 0; + GF_FREE (fctx->xl_array); + + GF_FREE (fctx); +err: + return 0; } -/** - * notify - */ int32_t notify (xlator_t *this, int32_t event, void *data, ...) { stripe_private_t *priv = NULL; int down_client = 0; int i = 0; + gf_boolean_t heard_from_all_children = _gf_false; + + if (!this) + return 0; priv = this->private; if (!priv) return 0; - switch (event) + switch (event) { case GF_EVENT_CHILD_UP: { @@ -3173,24 +4324,28 @@ notify (xlator_t *this, int32_t event, void *data, ...) if (data == priv->xl_array[i]) break; } - priv->state[i] = 1; - for (i = 0; i < priv->child_count; i++) { - if (!priv->state[i]) - down_client++; + + if (priv->child_count == i) { + gf_log (this->name, GF_LOG_ERROR, + "got GF_EVENT_CHILD_UP bad subvolume %s", + data? ((xlator_t *)data)->name: NULL); + break; } LOCK (&priv->lock); { - priv->nodes_down = down_client; - - if (data == FIRST_CHILD (this)) { + if (data == FIRST_CHILD (this)) priv->first_child_down = 0; - default_notify (this, event, data); - } + priv->last_event[i] = event; } UNLOCK (&priv->lock); } break; + case GF_EVENT_CHILD_CONNECTING: + { + // 'CONNECTING' doesn't ensure its CHILD_UP, so do nothing + goto out; + } case GF_EVENT_CHILD_DOWN: { /* get an index number to set */ @@ -3198,20 +4353,19 @@ notify (xlator_t *this, int32_t event, void *data, ...) if (data == priv->xl_array[i]) break; } - priv->state[i] = 0; - for (i = 0; i < priv->child_count; i++) { - if (!priv->state[i]) - down_client++; + + if (priv->child_count == i) { + gf_log (this->name, GF_LOG_ERROR, + "got GF_EVENT_CHILD_DOWN bad subvolume %s", + data? ((xlator_t *)data)->name: NULL); + break; } LOCK (&priv->lock); { - priv->nodes_down = down_client; - - if (data == FIRST_CHILD (this)) { + if (data == FIRST_CHILD (this)) priv->first_child_down = 1; - default_notify (this, event, data); - } + priv->last_event[i] = event; } UNLOCK (&priv->lock); } @@ -3221,82 +4375,751 @@ notify (xlator_t *this, int32_t event, void *data, ...) { /* */ default_notify (this, event, data); + goto out; } break; } + // Consider child as down if it's last_event is not CHILD_UP + for (i = 0, down_client = 0; i < priv->child_count; i++) + if (priv->last_event[i] != GF_EVENT_CHILD_UP) + down_client++; + + LOCK (&priv->lock); + { + priv->nodes_down = down_client; + } + UNLOCK (&priv->lock); + + heard_from_all_children = _gf_true; + for (i = 0; i < priv->child_count; i++) + if (!priv->last_event[i]) + heard_from_all_children = _gf_false; + + if (heard_from_all_children) + default_notify (this, event, data); +out: return 0; } int -set_stripe_block_size (xlator_t *this, stripe_private_t *priv, char *data) +stripe_setxattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, dict_t *xdata) { - int ret = -1; - char *tmp_str = NULL; - char *tmp_str1 = NULL; - char *dup_str = NULL; - char *stripe_str = NULL; - char *pattern = NULL; - char *num = NULL; - struct stripe_options *temp_stripeopt = NULL; - struct stripe_options *stripe_opt = NULL; - - /* Get the pattern for striping. - "option block-size *avi:10MB" etc */ - stripe_str = strtok_r (data, ",", &tmp_str); - while (stripe_str) { - dup_str = strdup (stripe_str); - stripe_opt = CALLOC (1, sizeof (struct stripe_options)); - if (!stripe_opt) - goto out; + int ret = -1; + int call_cnt = 0; + stripe_local_t *local = NULL; + + if (!frame || !frame->local || !this) { + gf_log ("", GF_LOG_ERROR, "Possible NULL deref"); + return ret; + } + + local = frame->local; + + LOCK (&frame->lock); + { + call_cnt = --local->wind_count; + + /** + * We overwrite ->op_* values here for subsequent faliure + * conditions, hence we propogate the last errno down the + * stack. + */ + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + goto unlock; + } + } + + unlock: + UNLOCK (&frame->lock); + + if (!call_cnt) { + STRIPE_STACK_UNWIND (setxattr, frame, local->op_ret, + local->op_errno, xdata); + } + + return 0; +} + +#ifdef HAVE_BD_XLATOR +int +stripe_is_bd (dict_t *this, char *key, data_t *value, void *data) +{ + gf_boolean_t *is_bd = data; + + if (data == NULL) + return 0; + + if (XATTR_IS_BD (key)) + *is_bd = _gf_true; + + return 0; +} + +static inline gf_boolean_t +stripe_setxattr_is_bd (dict_t *dict) +{ + gf_boolean_t is_bd = _gf_false; + + if (dict == NULL) + goto out; + + dict_foreach (dict, stripe_is_bd, &is_bd); +out: + return is_bd; +} +#else +#define stripe_setxattr_is_bd(dict) _gf_false +#endif + +int +stripe_setxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *dict, int flags, dict_t *xdata) +{ + int32_t op_errno = EINVAL; + xlator_list_t *trav = NULL; + stripe_private_t *priv = NULL; + stripe_local_t *local = NULL; + int i = 0; + gf_boolean_t is_bd = _gf_false; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + + GF_IF_INTERNAL_XATTR_GOTO ("trusted.*stripe*", dict, + op_errno, err); + + priv = this->private; + trav = this->children; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + frame->local = local; + local->wind_count = priv->child_count; + local->op_ret = local->op_errno = 0; + + is_bd = stripe_setxattr_is_bd (dict); + + /** + * Set xattrs for directories on all subvolumes. Additionally + * this power is only given to a special client. Bd xlator + * also needs xattrs for regular files (ie LVs) + */ + if (((frame->root->pid == GF_CLIENT_PID_GSYNCD) && + IA_ISDIR (loc->inode->ia_type)) || is_bd) { + for (i = 0; i < priv->child_count; i++, trav = trav->next) { + STACK_WIND (frame, stripe_setxattr_cbk, + trav->xlator, trav->xlator->fops->setxattr, + loc, dict, flags, xdata); + } + } else { + local->wind_count = 1; + STACK_WIND (frame, stripe_setxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + loc, dict, flags, xdata); + } + + return 0; +err: + STRIPE_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + return 0; +} + + +int +stripe_fsetxattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, dict_t *xdata) +{ + STRIPE_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +stripe_is_special_key (dict_t *this, + char *key, + data_t *value, + void *data) +{ + gf_boolean_t *is_special = NULL; + + if (data == NULL) { + goto out; + } + + is_special = data; + + if (XATTR_IS_LOCKINFO (key) || XATTR_IS_BD (key)) + *is_special = _gf_true; + +out: + return 0; +} + +int32_t +stripe_fsetxattr_everyone_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xdata) +{ + int call_count = 0; + stripe_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + call_count = --local->wind_count; + + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + if (call_count == 0) { + STRIPE_STACK_UNWIND (fsetxattr, frame, local->op_ret, + local->op_errno, NULL); + } + return 0; +} + +int +stripe_fsetxattr_to_everyone (call_frame_t *frame, xlator_t *this, fd_t *fd, + dict_t *dict, int flags, dict_t *xdata) +{ + xlator_list_t *trav = NULL; + stripe_private_t *priv = NULL; + int ret = -1; + stripe_local_t *local = NULL; + + priv = this->private; + + local = mem_get0 (this->local_pool); + if (local == NULL) { + goto out; + } + + frame->local = local; + + local->wind_count = priv->child_count; + + trav = this->children; + + while (trav) { + STACK_WIND (frame, stripe_fsetxattr_everyone_cbk, + trav->xlator, trav->xlator->fops->fsetxattr, + fd, dict, flags, xdata); + trav = trav->next; + } + + ret = 0; +out: + return ret; +} + +static inline gf_boolean_t +stripe_fsetxattr_is_special (dict_t *dict) +{ + gf_boolean_t is_spl = _gf_false; + + if (dict == NULL) { + goto out; + } + + dict_foreach (dict, stripe_is_special_key, &is_spl); + +out: + return is_spl; +} + +int +stripe_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + dict_t *dict, int flags, dict_t *xdata) +{ + int32_t op_ret = -1, ret = -1, op_errno = EINVAL; + gf_boolean_t is_spl = _gf_false; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + GF_IF_INTERNAL_XATTR_GOTO ("trusted.*stripe*", dict, + op_errno, err); + + is_spl = stripe_fsetxattr_is_special (dict); + if (is_spl) { + ret = stripe_fsetxattr_to_everyone (frame, this, fd, dict, + flags, xdata); + if (ret < 0) { + op_errno = ENOMEM; + goto err; + } + + goto out; + } + + STACK_WIND (frame, stripe_fsetxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, + fd, dict, flags, xdata); +out: + return 0; +err: + STRIPE_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, NULL); + return 0; +} + +int +stripe_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + STRIPE_STACK_UNWIND (removexattr, frame, op_ret, op_errno, xdata); + return 0; +} + +int +stripe_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name, dict_t *xdata) +{ + int32_t op_errno = EINVAL; + + VALIDATE_OR_GOTO (this, err); + + GF_IF_NATIVE_XATTR_GOTO ("trusted.*stripe*", + name, op_errno, err); + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (loc, err); + + STACK_WIND (frame, stripe_removexattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, + loc, name, xdata); + return 0; +err: + STRIPE_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); + return 0; +} + + +int +stripe_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + STRIPE_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, xdata); + return 0; +} + +int +stripe_fremovexattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + GF_IF_NATIVE_XATTR_GOTO ("trusted.*stripe*", + name, op_errno, err); + + STACK_WIND (frame, stripe_fremovexattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, + fd, name, xdata); + return 0; + err: + STRIPE_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, xdata); + return 0; +} + +int32_t +stripe_readdirp_lookup_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, + dict_t *xattr, struct iatt *parent) +{ + stripe_local_t *local = NULL; + call_frame_t *main_frame = NULL; + stripe_local_t *main_local = NULL; + gf_dirent_t *entry = NULL; + call_frame_t *prev = NULL; + int done = 0; + + local = frame->local; + prev = cookie; + + entry = local->dirent; + + main_frame = local->orig_frame; + main_local = main_frame->local; + LOCK (&frame->lock); + { + + local->call_count--; + if (!local->call_count) + done = 1; + if (op_ret == -1) { + local->op_errno = op_errno; + local->op_ret = op_ret; + goto unlock; + } - pattern = strtok_r (dup_str, ":", &tmp_str1); - num = strtok_r (NULL, ":", &tmp_str1); - if (!num) { - num = pattern; - pattern = "*"; + if (stripe_ctx_handle(this, prev, local, xattr)) + gf_log(this->name, GF_LOG_ERROR, + "Error getting fctx info from dict."); + + correct_file_size(stbuf, local->fctx, prev); + + stripe_iatt_merge (stbuf, &entry->d_stat); + local->stbuf_blocks += stbuf->ia_blocks; + } +unlock: + UNLOCK(&frame->lock); + + if (done) { + inode_ctx_put (entry->inode, this, + (uint64_t) (long)local->fctx); + + done = 0; + LOCK (&main_frame->lock); + { + main_local->wind_count--; + if (!main_local->wind_count) + done = 1; + if (local->op_ret == -1) { + main_local->op_errno = local->op_errno; + main_local->op_ret = local->op_ret; + } + entry->d_stat.ia_blocks = local->stbuf_blocks; + } + UNLOCK (&main_frame->lock); + if (done) { + main_frame->local = NULL; + STRIPE_STACK_UNWIND (readdir, main_frame, + main_local->op_ret, + main_local->op_errno, + &main_local->entries, NULL); + gf_dirent_free (&main_local->entries); + stripe_local_wipe (main_local); + mem_put (main_local); } - if (gf_string2bytesize (num, &stripe_opt->block_size) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "invalid number format \"%s\"", num); + frame->local = NULL; + stripe_local_wipe (local); + mem_put (local); + STRIPE_STACK_DESTROY (frame); + } + + return 0; +} + +int32_t +stripe_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + gf_dirent_t *orig_entries, dict_t *xdata) +{ + stripe_local_t *local = NULL; + call_frame_t *prev = NULL; + gf_dirent_t *local_entry = NULL; + gf_dirent_t *tmp_entry = NULL; + xlator_list_t *trav = NULL; + loc_t loc = {0, }; + int32_t count = 0; + stripe_private_t *priv = NULL; + int32_t subvols = 0; + dict_t *xattrs = NULL; + call_frame_t *local_frame = NULL; + stripe_local_t *local_ent = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + prev = cookie; + local = frame->local; + trav = this->children; + priv = this->private; + + subvols = priv->child_count; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + prev->this->name, strerror (op_errno)); + local->op_errno = op_errno; + local->op_ret = op_ret; + goto unlock; + } else { + local->op_ret = op_ret; + list_splice_init (&orig_entries->list, + &local->entries.list); + local->wind_count = op_ret; + } + + } +unlock: + UNLOCK (&frame->lock); + + if (op_ret == -1) + goto out; + + xattrs = dict_new (); + if (xattrs) + (void) stripe_xattr_request_build (this, xattrs, 0, 0, 0, 0); + count = op_ret; + list_for_each_entry_safe (local_entry, tmp_entry, + (&local->entries.list), list) { + + if (!local_entry) + break; + if (!IA_ISREG (local_entry->d_stat.ia_type) || !local_entry->inode) { + LOCK (&frame->lock); + { + local->wind_count--; + count = local->wind_count; + } + UNLOCK (&frame->lock); + continue; + } + + local_frame = copy_frame (frame); + + if (!local_frame) { + op_errno = ENOMEM; + op_ret = -1; goto out; - } - memcpy (stripe_opt->path_pattern, pattern, strlen (pattern)); - - gf_log (this->name, GF_LOG_DEBUG, - "block-size : pattern %s : size %"PRId64, - stripe_opt->path_pattern, stripe_opt->block_size); - - if (!priv->pattern) { - priv->pattern = stripe_opt; + } + + local_ent = mem_get0 (this->local_pool); + if (!local_ent) { + op_errno = ENOMEM; + op_ret = -1; + goto out; + } + + loc.inode = inode_ref (local_entry->inode); + + uuid_copy (loc.gfid, local_entry->d_stat.ia_gfid); + + local_ent->orig_frame = frame; + + local_ent->call_count = subvols; + + local_ent->dirent = local_entry; + + local_frame->local = local_ent; + + trav = this->children; + while (trav) { + STACK_WIND (local_frame, stripe_readdirp_lookup_cbk, + trav->xlator, trav->xlator->fops->lookup, + &loc, xattrs); + trav = trav->next; + } + loc_wipe (&loc); + } +out: + if (!count) { + /* all entries are directories */ + frame->local = NULL; + STRIPE_STACK_UNWIND (readdir, frame, local->op_ret, + local->op_errno, &local->entries, NULL); + gf_dirent_free (&local->entries); + stripe_local_wipe (local); + mem_put (local); + } + if (xattrs) + dict_unref (xattrs); + return 0; + +} +int32_t +stripe_readdirp (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t off, dict_t *xdata) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = NULL; + xlator_list_t *trav = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + priv = this->private; + trav = this->children; + + if (priv->first_child_down) { + op_errno = ENOTCONN; + goto err; + } + + /* Initialization */ + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + frame->local = local; + + local->fd = fd_ref (fd); + + local->wind_count = 0; + + local->count = 0; + local->op_ret = -1; + INIT_LIST_HEAD(&local->entries); + + if (!trav) + goto err; + + STACK_WIND (frame, stripe_readdirp_cbk, trav->xlator, + trav->xlator->fops->readdirp, fd, size, off, xdata); + return 0; +err: + op_errno = (op_errno == -1) ? errno : op_errno; + STRIPE_STACK_UNWIND (readdir, frame, -1, op_errno, NULL, NULL); + + return 0; + +} + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + if (!this) + goto out; + + ret = xlator_mem_acct_init (this, gf_stripe_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); + goto out; + } + +out: + return ret; +} + +static int +clear_pattern_list (stripe_private_t *priv) +{ + struct stripe_options *prev = NULL; + struct stripe_options *trav = NULL; + int ret = -1; + + GF_VALIDATE_OR_GOTO ("stripe", priv, out); + + trav = priv->pattern; + priv->pattern = NULL; + while (trav) { + prev = trav; + trav = trav->next; + GF_FREE (prev); + } + + ret = 0; + out: + return ret; + + +} + + +int +reconfigure (xlator_t *this, dict_t *options) +{ + + stripe_private_t *priv = NULL; + data_t *data = NULL; + int ret = -1; + volume_option_t *opt = NULL; + + GF_ASSERT (this); + GF_ASSERT (this->private); + + priv = this->private; + + + ret = 0; + LOCK (&priv->lock); + { + ret = clear_pattern_list (priv); + if (ret) + goto unlock; + + data = dict_get (options, "block-size"); + if (data) { + ret = set_stripe_block_size (this, priv, data->data); + if (ret) + goto unlock; } else { - temp_stripeopt = priv->pattern; - while (temp_stripeopt->next) - temp_stripeopt = temp_stripeopt->next; - temp_stripeopt->next = stripe_opt; + opt = xlator_volume_option_get (this, "block-size"); + if (!opt) { + gf_log (this->name, GF_LOG_WARNING, + "option 'block-size' not found"); + ret = -1; + goto unlock; + } + + if (gf_string2bytesize_uint64 (opt->default_value, &priv->block_size)){ + gf_log (this->name, GF_LOG_ERROR, + "Unable to set default block-size "); + ret = -1; + goto unlock; + } } - stripe_str = strtok_r (NULL, ",", &tmp_str); + + GF_OPTION_RECONF("coalesce", priv->coalesce, options, bool, + unlock); } + unlock: + UNLOCK (&priv->lock); + if (ret) + goto out; ret = 0; out: return ret; + } /** - * init - This function is called when xlator-graph gets initialized. + * init - This function is called when xlator-graph gets initialized. * The option given in volfiles are parsed here. - * @this - + * @this - */ int32_t init (xlator_t *this) { stripe_private_t *priv = NULL; + volume_option_t *opt = NULL; xlator_list_t *trav = NULL; data_t *data = NULL; int32_t count = 0; int ret = -1; + if (!this) + goto out; + trav = this->children; while (trav) { count++; @@ -3314,16 +5137,27 @@ init (xlator_t *this) gf_log (this->name, GF_LOG_WARNING, "dangling volume. check volfile "); } - - priv = CALLOC (1, sizeof (stripe_private_t)); + + if (count == 1) { + gf_log (this->name, GF_LOG_ERROR, + "stripe configured with only one \"subvolumes\" option." + " please check the volume. exiting"); + goto out; + } + + priv = GF_CALLOC (1, sizeof (stripe_private_t), + gf_stripe_mt_stripe_private_t); + if (!priv) goto out; - priv->xl_array = CALLOC (count, sizeof (xlator_t *)); + priv->xl_array = GF_CALLOC (count, sizeof (xlator_t *), + gf_stripe_mt_xlator_t); if (!priv->xl_array) goto out; - priv->state = CALLOC (count, sizeof (int8_t)); - if (!priv->xl_array) + priv->last_event = GF_CALLOC (count, sizeof (int), + gf_stripe_mt_int32_t); + if (!priv->last_event) goto out; priv->child_count = count; @@ -3343,128 +5177,614 @@ init (xlator_t *this) goto out; } - priv->block_size = (128 * GF_UNIT_KB); - /* option stripe-pattern *avi:1GB,*pdf:4096 */ - data = dict_get (this->options, "block-size"); - if (!data) { - gf_log (this->name, GF_LOG_DEBUG, - "No \"option block-size <x>\" given, defaulting " - "to 128KB"); - } else { - ret = set_stripe_block_size (this, priv, data->data); - if (ret) - goto out; - } - - priv->xattr_supported = 1; - data = dict_get (this->options, "use-xattr"); - if (data) { - if (gf_string2boolean (data->data, - &priv->xattr_supported) == -1) { + ret = 0; + LOCK (&priv->lock); + { + opt = xlator_volume_option_get (this, "block-size"); + if (!opt) { + gf_log (this->name, GF_LOG_WARNING, + "option 'block-size' not found"); + ret = -1; + goto unlock; + } + if (gf_string2bytesize_uint64 (opt->default_value, &priv->block_size)){ gf_log (this->name, GF_LOG_ERROR, - "error setting hard check for extended " - "attribute"); - //return -1; + "Unable to set default block-size "); + ret = -1; + goto unlock; + } + /* option stripe-pattern *avi:1GB,*pdf:16K */ + data = dict_get (this->options, "block-size"); + if (data) { + ret = set_stripe_block_size (this, priv, data->data); + if (ret) + goto unlock; } } + unlock: + UNLOCK (&priv->lock); + if (ret) + goto out; + GF_OPTION_INIT ("use-xattr", priv->xattr_supported, bool, out); /* notify related */ priv->nodes_down = priv->child_count; + + GF_OPTION_INIT("coalesce", priv->coalesce, bool, out); + + this->local_pool = mem_pool_new (stripe_local_t, 128); + if (!this->local_pool) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "failed to create local_t's memory pool"); + goto out; + } + this->private = priv; ret = 0; - out: +out: if (ret) { if (priv) { - if (priv->xl_array) - FREE (priv->xl_array); - FREE (priv); + GF_FREE (priv->xl_array); + GF_FREE (priv); } } return ret; -} +} -/** +/** * fini - Free all the private variables - * @this - + * @this - */ -void +void fini (xlator_t *this) { stripe_private_t *priv = NULL; struct stripe_options *prev = NULL; struct stripe_options *trav = NULL; + if (!this) + goto out; + priv = this->private; if (priv) { - if (priv->xl_array) - FREE (priv->xl_array); + this->private = NULL; + GF_FREE (priv->xl_array); trav = priv->pattern; while (trav) { prev = trav; trav = trav->next; - FREE (prev); + GF_FREE (prev); } + GF_FREE (priv->last_event); LOCK_DESTROY (&priv->lock); - FREE (priv); + GF_FREE (priv); } +out: return; } +int32_t +stripe_getxattr_unwind (call_frame_t *frame, + int op_ret, int op_errno, dict_t *dict, dict_t *xdata) + +{ + STRIPE_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); + return 0; +} + +int +stripe_internal_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, + dict_t *xdata) +{ -struct xlator_fops fops = { - .stat = stripe_stat, - .unlink = stripe_unlink, - .symlink = stripe_symlink, - .rename = stripe_rename, - .link = stripe_link, - .chmod = stripe_chmod, - .chown = stripe_chown, - .truncate = stripe_truncate, - .utimens = stripe_utimens, - .create = stripe_create, - .open = stripe_open, - .readv = stripe_readv, - .writev = stripe_writev, - .statfs = stripe_statfs, - .flush = stripe_flush, - .fsync = stripe_fsync, - .setxattr = stripe_setxattr, - .getxattr = stripe_getxattr, - .removexattr = stripe_removexattr, - .access = stripe_access, - .ftruncate = stripe_ftruncate, - .fstat = stripe_fstat, - .readlink = stripe_readlink, - .mkdir = stripe_mkdir, - .rmdir = stripe_rmdir, - .lk = stripe_lk, - .opendir = stripe_opendir, - .fsyncdir = stripe_fsyncdir, - .fchmod = stripe_fchmod, - .fchown = stripe_fchown, - .lookup = stripe_lookup, - .setdents = stripe_setdents, - .mknod = stripe_mknod, -}; + char size_key[256] = {0,}; + char index_key[256] = {0,}; + char count_key[256] = {0,}; + char coalesce_key[256] = {0,}; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (frame->local, out); -struct xlator_mops mops = { - .stats = stripe_stats, + if (!xattr || (op_ret == -1)) + goto out; + + sprintf (size_key, "trusted.%s.stripe-size", this->name); + sprintf (count_key, "trusted.%s.stripe-count", this->name); + sprintf (index_key, "trusted.%s.stripe-index", this->name); + sprintf (coalesce_key, "trusted.%s.stripe-coalesce", this->name); + + dict_del (xattr, size_key); + dict_del (xattr, count_key); + dict_del (xattr, index_key); + dict_del (xattr, coalesce_key); + +out: + STRIPE_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata); + + return 0; + +} + +int +stripe_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ + int call_cnt = 0; + stripe_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (frame->local, out); + + local = frame->local; + + LOCK (&frame->lock); + { + call_cnt = --local->wind_count; + } + UNLOCK (&frame->lock); + + if (!xattr || (op_ret < 0)) + goto out; + + local->op_ret = 0; + + if (!local->xattr) { + local->xattr = dict_ref (xattr); + } else { + stripe_aggregate_xattr (local->xattr, xattr); + } + +out: + if (!call_cnt) { + STRIPE_STACK_UNWIND (getxattr, frame, local->op_ret, op_errno, + local->xattr, xdata); + } + + return 0; +} + +int32_t +stripe_vgetxattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + stripe_local_t *local = NULL; + int32_t callcnt = 0; + int32_t ret = -1; + long cky = 0; + void *xattr_val = NULL; + void *xattr_serz = NULL; + stripe_xattr_sort_t *xattr = NULL; + dict_t *stripe_xattr = NULL; + + if (!frame || !frame->local || !this) { + gf_log ("", GF_LOG_ERROR, "Possible NULL deref"); + return ret; + } + + local = frame->local; + cky = (long) cookie; + + if (local->xsel[0] == '\0') { + gf_log (this->name, GF_LOG_ERROR, "Empty xattr in cbk"); + return ret; + } + + LOCK (&frame->lock); + { + callcnt = --local->wind_count; + + if (!dict || (op_ret < 0)) + goto out; + + if (!local->xattr_list) + local->xattr_list = (stripe_xattr_sort_t *) + GF_CALLOC (local->nallocs, + sizeof (stripe_xattr_sort_t), + gf_stripe_mt_xattr_sort_t); + + if (local->xattr_list) { + xattr = local->xattr_list + (int32_t) cky; + + ret = dict_get_ptr_and_len (dict, local->xsel, + &xattr_val, + &xattr->xattr_len); + if (xattr->xattr_len == 0) + goto out; + + xattr->pos = cky; + xattr->xattr_value = gf_memdup (xattr_val, + xattr->xattr_len); + + if (xattr->xattr_value != NULL) + local->xattr_total_len += xattr->xattr_len + 1; + } + } + out: + UNLOCK (&frame->lock); + + if (!callcnt) { + if (!local->xattr_total_len) + goto unwind; + + stripe_xattr = dict_new (); + if (!stripe_xattr) + goto unwind; + + /* select filler based on ->xsel */ + if (XATTR_IS_PATHINFO (local->xsel)) + ret = stripe_fill_pathinfo_xattr (this, local, + (char **)&xattr_serz); + else if (XATTR_IS_LOCKINFO (local->xsel)) { + ret = stripe_fill_lockinfo_xattr (this, local, + &xattr_serz); + } else { + gf_log (this->name, GF_LOG_WARNING, + "Unknown xattr in xattr request"); + goto unwind; + } + + if (!ret) { + ret = dict_set_dynptr (stripe_xattr, local->xsel, + xattr_serz, + local->xattr_total_len); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Can't set %s key in dict", + local->xsel); + } + + unwind: + STRIPE_STACK_UNWIND (getxattr, frame, op_ret, op_errno, + stripe_xattr, NULL); + + ret = stripe_free_xattr_str (local); + + GF_FREE (local->xattr_list); + + if (stripe_xattr) + dict_unref (stripe_xattr); + } + + return ret; +} + +int32_t +stripe_getxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name, dict_t *xdata) +{ + stripe_local_t *local = NULL; + xlator_list_t *trav = NULL; + stripe_private_t *priv = NULL; + int32_t op_errno = EINVAL; + int i = 0; + xlator_t **sub_volumes; + int ret = 0; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->path, err); + VALIDATE_OR_GOTO (loc->inode, err); + + priv = this->private; + trav = this->children; + + /* Initialization */ + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + local->op_ret = -1; + frame->local = local; + loc_copy (&local->loc, loc); + + + if (name && (strcmp (GF_XATTR_MARKER_KEY, name) == 0) + && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { + local->marker.call_count = priv->child_count; + + sub_volumes = alloca ( priv->child_count * + sizeof (xlator_t *)); + for (i = 0, trav = this->children; trav ; + trav = trav->next, i++) { + + *(sub_volumes + i) = trav->xlator; + + } + + if (cluster_getmarkerattr (frame, this, loc, name, + local, stripe_getxattr_unwind, + sub_volumes, priv->child_count, + MARKER_UUID_TYPE, marker_uuid_default_gauge, + priv->vol_uuid)) { + op_errno = EINVAL; + goto err; + } + + return 0; + } + + if (name && strncmp (name, GF_XATTR_QUOTA_SIZE_KEY, + strlen (GF_XATTR_QUOTA_SIZE_KEY)) == 0) { + local->wind_count = priv->child_count; + + for (i = 0, trav=this->children; i < priv->child_count; i++, + trav = trav->next) { + STACK_WIND (frame, stripe_getxattr_cbk, + trav->xlator, trav->xlator->fops->getxattr, + loc, name, xdata); + } + + return 0; + } + + if (name && (XATTR_IS_PATHINFO (name))) { + if (IA_ISREG (loc->inode->ia_type)) { + ret = inode_ctx_get (loc->inode, this, + (uint64_t *) &local->fctx); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "stripe size unavailable from fctx" + " relying on pathinfo could lead to" + " wrong results"); + } + + local->nallocs = local->wind_count = priv->child_count; + (void) strncpy (local->xsel, name, strlen (name)); + + /** + * for xattrs that need info from all childs, fill ->xsel + * as above and call the filler function in cbk based on + * it + */ + for (i = 0, trav = this->children; i < priv->child_count; i++, + trav = trav->next) { + STACK_WIND_COOKIE (frame, stripe_vgetxattr_cbk, + (void *) (long) i, trav->xlator, + trav->xlator->fops->getxattr, + loc, name, xdata); + } + + return 0; + } + + if (name &&(*priv->vol_uuid)) { + if ((match_uuid_local (name, priv->vol_uuid) == 0) + && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { + + if (!IA_FILE_OR_DIR (loc->inode->ia_type)) + local->marker.call_count = 1; + else + local->marker.call_count = priv->child_count; + + sub_volumes = alloca (local->marker.call_count * + sizeof (xlator_t *)); + + for (i = 0, trav = this->children; + i < local->marker.call_count; + i++, trav = trav->next) { + *(sub_volumes + i) = trav->xlator; + + } + + if (cluster_getmarkerattr (frame, this, loc, name, + local, + stripe_getxattr_unwind, + sub_volumes, + local->marker.call_count, + MARKER_XTIME_TYPE, + marker_xtime_default_gauge, + priv->vol_uuid)) { + op_errno = EINVAL; + goto err; + } + + return 0; + } + } + + + STACK_WIND (frame, stripe_internal_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); + + return 0; + +err: + STRIPE_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL); + return 0; +} + +static inline gf_boolean_t +stripe_is_special_xattr (const char *name) +{ + gf_boolean_t is_spl = _gf_false; + + if (!name) { + goto out; + } + + if (!strncmp (name, GF_XATTR_LOCKINFO_KEY, + strlen (GF_XATTR_LOCKINFO_KEY)) + || XATTR_IS_PATHINFO (name)) + is_spl = _gf_true; +out: + return is_spl; +} + +int32_t +stripe_fgetxattr_from_everyone (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = NULL; + int32_t ret = -1, op_errno = 0; + int i = 0; + xlator_list_t *trav = NULL; + + priv = this->private; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->op_ret = -1; + frame->local = local; + + strncpy (local->xsel, name, strlen (name)); + local->nallocs = local->wind_count = priv->child_count; + + for (i = 0, trav = this->children; i < priv->child_count; i++, + trav = trav->next) { + STACK_WIND_COOKIE (frame, stripe_vgetxattr_cbk, + (void *) (long) i, trav->xlator, + trav->xlator->fops->fgetxattr, + fd, name, xdata); + } + + return 0; + +err: + STACK_UNWIND_STRICT (fgetxattr, frame, -1, op_errno, NULL, NULL); + return ret; +} + +int32_t +stripe_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + if (stripe_is_special_xattr (name)) { + stripe_fgetxattr_from_everyone (frame, this, fd, name, xdata); + goto out; + } + + STACK_WIND (frame, stripe_internal_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata); + +out: + return 0; +} + + + +int32_t +stripe_priv_dump (xlator_t *this) +{ + char key[GF_DUMP_MAX_BUF_LEN]; + int i = 0; + stripe_private_t *priv = NULL; + int ret = -1; + struct stripe_options *options = NULL; + + GF_VALIDATE_OR_GOTO ("stripe", this, out); + + priv = this->private; + if (!priv) + goto out; + + ret = TRY_LOCK (&priv->lock); + if (ret != 0) + goto out; + + gf_proc_dump_add_section("xlator.cluster.stripe.%s.priv", this->name); + gf_proc_dump_write("child_count","%d", priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + sprintf (key, "subvolumes[%d]", i); + gf_proc_dump_write (key, "%s.%s", priv->xl_array[i]->type, + priv->xl_array[i]->name); + } + + options = priv->pattern; + while (options != NULL) { + gf_proc_dump_write ("path_pattern", "%s", priv->pattern->path_pattern); + gf_proc_dump_write ("options_block_size", "%ul", options->block_size); + + options = options->next; + } + + gf_proc_dump_write ("block_size", "%ul", priv->block_size); + gf_proc_dump_write ("nodes-down", "%d", priv->nodes_down); + gf_proc_dump_write ("first-child_down", "%d", priv->first_child_down); + gf_proc_dump_write ("xattr_supported", "%d", priv->xattr_supported); + + UNLOCK (&priv->lock); + +out: + return ret; +} + +struct xlator_fops fops = { + .stat = stripe_stat, + .unlink = stripe_unlink, + .rename = stripe_rename, + .link = stripe_link, + .truncate = stripe_truncate, + .create = stripe_create, + .open = stripe_open, + .readv = stripe_readv, + .writev = stripe_writev, + .statfs = stripe_statfs, + .flush = stripe_flush, + .fsync = stripe_fsync, + .ftruncate = stripe_ftruncate, + .fstat = stripe_fstat, + .mkdir = stripe_mkdir, + .rmdir = stripe_rmdir, + .lk = stripe_lk, + .opendir = stripe_opendir, + .fsyncdir = stripe_fsyncdir, + .setattr = stripe_setattr, + .fsetattr = stripe_fsetattr, + .lookup = stripe_lookup, + .mknod = stripe_mknod, + .setxattr = stripe_setxattr, + .fsetxattr = stripe_fsetxattr, + .getxattr = stripe_getxattr, + .fgetxattr = stripe_fgetxattr, + .removexattr = stripe_removexattr, + .fremovexattr = stripe_fremovexattr, + .readdirp = stripe_readdirp, + .fallocate = stripe_fallocate, + .discard = stripe_discard, + .zerofill = stripe_zerofill, }; struct xlator_cbks cbks = { .release = stripe_release, + .forget = stripe_forget, }; +struct xlator_dumpops dumpops = { + .priv = stripe_priv_dump, +}; struct volume_options options[] = { - { .key = {"block-size"}, - .type = GF_OPTION_TYPE_ANY + { .key = {"block-size"}, + .type = GF_OPTION_TYPE_SIZE_LIST, + .default_value = "128KB", + .min = STRIPE_MIN_BLOCK_SIZE, + .description = "Size of the stripe unit that would be read " + "from or written to the striped servers." }, - { .key = {"use-xattr"}, - .type = GF_OPTION_TYPE_BOOL + { .key = {"use-xattr"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "true" }, + { .key = {"coalesce"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "true", + .description = "Enable/Disable coalesce mode to flatten striped " + "files as stored on the server (i.e., eliminate holes " + "caused by the traditional format)." + }, { .key = {NULL} }, }; diff --git a/xlators/cluster/stripe/src/stripe.h b/xlators/cluster/stripe/src/stripe.h index 5ffbc3670..5673d18f3 100644 --- a/xlators/cluster/stripe/src/stripe.h +++ b/xlators/cluster/stripe/src/stripe.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -32,12 +23,63 @@ #include "common-utils.h" #include "compat.h" #include "compat-errno.h" +#include "stripe-mem-types.h" +#include "libxlator.h" #include <fnmatch.h> #include <signal.h> +#define STRIPE_PATHINFO_HEADER "STRIPE:" +#define STRIPE_MIN_BLOCK_SIZE (16*GF_UNIT_KB) + +#define STRIPE_STACK_UNWIND(fop, frame, params ...) do { \ + stripe_local_t *__local = NULL; \ + if (frame) { \ + __local = frame->local; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT (fop, frame, params); \ + if (__local) { \ + stripe_local_wipe(__local); \ + mem_put (__local); \ + } \ + } while (0) + +#define STRIPE_STACK_DESTROY(frame) do { \ + stripe_local_t *__local = NULL; \ + __local = frame->local; \ + frame->local = NULL; \ + STACK_DESTROY (frame->root); \ + if (__local) { \ + stripe_local_wipe (__local); \ + mem_put (__local); \ + } \ + } while (0) + +#define STRIPE_VALIDATE_FCTX(fctx, label) do { \ + int idx = 0; \ + if (!fctx) { \ + op_errno = EINVAL; \ + goto label; \ + } \ + for (idx = 0; idx < fctx->stripe_count; idx++) { \ + if (!fctx->xl_array[idx]) { \ + gf_log (this->name, GF_LOG_ERROR, \ + "fctx->xl_array[%d] is NULL", \ + idx); \ + op_errno = ESTALE; \ + goto label; \ + } \ + } \ + } while (0) + +typedef struct stripe_xattr_sort { + int pos; + int xattr_len; + char *xattr_value; +} stripe_xattr_sort_t; /** - * struct stripe_options : This keeps the pattern and the block-size + * struct stripe_options : This keeps the pattern and the block-size * information, which is used for striping on a file. */ struct stripe_options { @@ -47,7 +89,7 @@ struct stripe_options { }; /** - * Private structure for stripe translator + * Private structure for stripe translator */ struct stripe_private { struct stripe_options *pattern; @@ -56,27 +98,31 @@ struct stripe_private { gf_lock_t lock; uint8_t nodes_down; int8_t first_child_down; + int *last_event; int8_t child_count; - int8_t *state; /* Current state of child node */ gf_boolean_t xattr_supported; /* default yes */ + gf_boolean_t coalesce; + char vol_uuid[UUID_SIZE + 1]; }; /** - * Used to keep info about the replies received from fops->readv calls + * Used to keep info about the replies received from readv/writev calls */ -struct readv_replies { +struct stripe_replies { struct iovec *vector; int32_t count; //count of vector int32_t op_ret; //op_ret of readv int32_t op_errno; - struct stat stbuf; /* 'stbuf' is also a part of reply */ + int32_t requested_size; + struct iatt stbuf; /* 'stbuf' is also a part of reply */ }; typedef struct _stripe_fd_ctx { off_t stripe_size; int stripe_count; + int stripe_coalesce; int static_array; - xlator_t **xl_array; + xlator_t **xl_array; } stripe_fd_ctx_t; @@ -87,28 +133,45 @@ struct stripe_local; /* this itself is used inside the structure; */ struct stripe_local { struct stripe_local *next; - call_frame_t *orig_frame; - + call_frame_t *orig_frame; + stripe_fd_ctx_t *fctx; /* Used by _cbk functions */ - struct stat stbuf; - struct readv_replies *replies; - struct statvfs statvfs_buf; - dir_entry_t *entry; - struct xlator_stats stats; + struct iatt stbuf; + struct iatt pre_buf; + struct iatt post_buf; + struct iatt preparent; + struct iatt postparent; + + off_t stbuf_size; + off_t prebuf_size; + off_t postbuf_size; + off_t preparent_size; + off_t postparent_size; + + blkcnt_t stbuf_blocks; + blkcnt_t prebuf_blocks; + blkcnt_t postbuf_blocks; + blkcnt_t preparent_blocks; + blkcnt_t postparent_blocks; + + struct stripe_replies *replies; + struct statvfs statvfs_buf; + dir_entry_t *entry; int8_t revalidate; int8_t failed; int8_t unwind; + size_t readv_size; int32_t entry_count; int32_t node_index; int32_t call_count; - int32_t wind_count; /* used instead of child_cound + int32_t wind_count; /* used instead of child_cound in case of read and write */ int32_t op_ret; - int32_t op_errno; + int32_t op_errno; int32_t count; int32_t flags; char *name; @@ -117,8 +180,17 @@ struct stripe_local { loc_t loc; loc_t loc2; + mode_t mode; + dev_t rdev; /* For File I/O fops */ - dict_t *dict; + dict_t *xdata; + + stripe_xattr_sort_t *xattr_list; + int32_t xattr_total_len; + int32_t nallocs; + char xsel[256]; + + struct marker_str marker; /* General usage */ off_t offset; @@ -128,13 +200,89 @@ struct stripe_local { int entry_self_heal_needed; int8_t *list; - struct flock lock; + struct gf_flock lock; fd_t *fd; void *value; struct iobref *iobref; + gf_dirent_t entries; + gf_dirent_t *dirent; + dict_t *xattr; + uuid_t ia_gfid; + + int xflag; + mode_t umask; }; typedef struct stripe_local stripe_local_t; typedef struct stripe_private stripe_private_t; +/* + * Determine the stripe index of a particular frame based on the translator. + */ +static inline int32_t stripe_get_frame_index(stripe_fd_ctx_t *fctx, + call_frame_t *prev) +{ + int32_t i, idx = -1; + + for (i = 0; i < fctx->stripe_count; i++) { + if (fctx->xl_array[i] == prev->this) { + idx = i; + break; + } + } + + return idx; +} + +static inline void stripe_copy_xl_array(xlator_t **dst, xlator_t **src, + int count) +{ + int i; + + for (i = 0; i < count; i++) + dst[i] = src[i]; +} + +void stripe_local_wipe (stripe_local_t *local); +int32_t stripe_ctx_handle (xlator_t *this, call_frame_t *prev, + stripe_local_t *local, dict_t *dict); +void stripe_aggregate_xattr (dict_t *dst, dict_t *src); +int32_t stripe_xattr_request_build (xlator_t *this, dict_t *dict, + uint64_t stripe_size, uint32_t stripe_count, + uint32_t stripe_index, + uint32_t stripe_coalesce); +int32_t stripe_get_matching_bs (const char *path, stripe_private_t *priv); +int set_stripe_block_size (xlator_t *this, stripe_private_t *priv, char *data); +int32_t stripe_iatt_merge (struct iatt *from, struct iatt *to); +int32_t stripe_fill_pathinfo_xattr (xlator_t *this, stripe_local_t *local, + char **xattr_serz); +int32_t stripe_free_xattr_str (stripe_local_t *local); +int32_t stripe_xattr_aggregate (char *buffer, stripe_local_t *local, + int32_t *total); +off_t coalesced_offset(off_t offset, uint64_t stripe_size, int stripe_count); +off_t uncoalesced_size(off_t size, uint64_t stripe_size, int stripe_count, + int stripe_index); +int32_t +stripe_fill_lockinfo_xattr (xlator_t *this, stripe_local_t *local, + void **xattr_serz); + +/* + * Adjust the size attribute for files if coalesce is enabled. + */ +static inline void correct_file_size(struct iatt *buf, stripe_fd_ctx_t *fctx, + call_frame_t *prev) +{ + int index; + + if (!IA_ISREG(buf->ia_type)) + return; + + if (!fctx || !fctx->stripe_coalesce) + return; + + index = stripe_get_frame_index(fctx, prev); + buf->ia_size = uncoalesced_size(buf->ia_size, fctx->stripe_size, + fctx->stripe_count, index); +} + #endif /* _STRIPE_H_ */ diff --git a/xlators/cluster/unify/src/Makefile.am b/xlators/cluster/unify/src/Makefile.am deleted file mode 100644 index b9e6f63e9..000000000 --- a/xlators/cluster/unify/src/Makefile.am +++ /dev/null @@ -1,16 +0,0 @@ - -xlator_LTLIBRARIES = unify.la -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster - -unify_la_LDFLAGS = -module -avoidversion - -unify_la_SOURCES = unify.c unify-self-heal.c -unify_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la - -noinst_HEADERS = unify.h - -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) - -CLEANFILES = - diff --git a/xlators/cluster/unify/src/unify-self-heal.c b/xlators/cluster/unify/src/unify-self-heal.c deleted file mode 100644 index 4e788fc7a..000000000 --- a/xlators/cluster/unify/src/unify-self-heal.c +++ /dev/null @@ -1,1225 +0,0 @@ -/* - Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -/** - * unify-self-heal.c : - * This file implements few functions which enables 'unify' translator - * to be consistent in its behaviour when - * > a node fails, - * > a node gets added, - * > a failed node comes back - * > a new namespace server is added (ie, an fresh namespace server). - * - * This functionality of 'unify' will enable glusterfs to support storage - * system failure, and maintain consistancy. This works both ways, ie, when - * an entry (either file or directory) is found on namespace server, and not - * on storage nodes, its created in storage nodes and vica-versa. - * - * The two fops, where it can be implemented are 'getdents ()' and 'lookup ()' - * - */ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "unify.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "common-utils.h" - -int32_t -unify_sh_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count); - -int32_t -unify_sh_ns_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count); - -int32_t -unify_bgsh_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count); - -int32_t -unify_bgsh_ns_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count); - -/** - * unify_local_wipe - free all the extra allocation of local->* here. - */ -static void -unify_local_wipe (unify_local_t *local) -{ - /* Free the strdup'd variables in the local structure */ - if (local->name) { - FREE (local->name); - } - - if (local->sh_struct) { - if (local->sh_struct->offset_list) - FREE (local->sh_struct->offset_list); - - if (local->sh_struct->entry_list) - FREE (local->sh_struct->entry_list); - - if (local->sh_struct->count_list) - FREE (local->sh_struct->count_list); - - FREE (local->sh_struct); - } - - loc_wipe (&local->loc1); - loc_wipe (&local->loc2); -} - -int32_t -unify_sh_setdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = -1; - unify_local_t *local = frame->local; - inode_t *inode = NULL; - dict_t *tmp_dict = NULL; - dir_entry_t *prev, *entry, *trav; - - LOCK (&frame->lock); - { - /* if local->call_count == 0, that means, setdents on - * storagenodes is still pending. - */ - if (local->call_count) - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - if (callcnt == 0) { - if (local->sh_struct->entry_list[0]) { - prev = entry = local->sh_struct->entry_list[0]; - if (!entry) - return 0; - trav = entry->next; - while (trav) { - prev->next = trav->next; - FREE (trav->name); - if (S_ISLNK (trav->buf.st_mode)) - FREE (trav->link); - FREE (trav); - trav = prev->next; - } - FREE (entry); - } - - if (!local->flags) { - if (local->sh_struct->count_list[0] >= - UNIFY_SELF_HEAL_GETDENTS_COUNT) { - /* count == size, that means, there are more entries - to read from */ - //local->call_count = 0; - local->sh_struct->offset_list[0] += - UNIFY_SELF_HEAL_GETDENTS_COUNT; - STACK_WIND (frame, - unify_sh_ns_getdents_cbk, - NS(this), - NS(this)->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - local->sh_struct->offset_list[0], - GF_GET_DIR_ONLY); - } - } else { - inode = local->loc1.inode; - fd_unref (local->fd); - tmp_dict = local->dict; - - unify_local_wipe (local); - - STACK_UNWIND (frame, local->op_ret, local->op_errno, - inode, &local->stbuf, local->dict); - if (tmp_dict) - dict_unref (tmp_dict); - } - } - - return 0; -} - - -int32_t -unify_sh_ns_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count) -{ - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - long index = 0; - unsigned long final = 0; - dir_entry_t *tmp = CALLOC (1, sizeof (dir_entry_t)); - - local->sh_struct->entry_list[0] = tmp; - local->sh_struct->count_list[0] = count; - if (entry) { - tmp->next = entry->next; - entry->next = NULL; - } - - if ((count < UNIFY_SELF_HEAL_GETDENTS_COUNT) || !entry) { - final = 1; - } - - LOCK (&frame->lock); - { - /* local->call_count will be '0' till now. make it 1 so, it - can be UNWIND'ed for the last call. */ - local->call_count = priv->child_count; - if (final) - local->flags = 1; - } - UNLOCK (&frame->lock); - - for (index = 0; index < priv->child_count; index++) - { - STACK_WIND_COOKIE (frame, - unify_sh_setdents_cbk, - (void *)index, - priv->xl_array[index], - priv->xl_array[index]->fops->setdents, - local->fd, GF_SET_DIR_ONLY, - local->sh_struct->entry_list[0], count); - } - - return 0; -} - -int32_t -unify_sh_ns_setdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = -1; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - long index = (long)cookie; - dir_entry_t *prev, *entry, *trav; - - LOCK (&frame->lock); - { - if (local->sh_struct->entry_list[index]) { - prev = entry = local->sh_struct->entry_list[index]; - trav = entry->next; - while (trav) { - prev->next = trav->next; - FREE (trav->name); - if (S_ISLNK (trav->buf.st_mode)) - FREE (trav->link); - FREE (trav); - trav = prev->next; - } - FREE (entry); - } - } - UNLOCK (&frame->lock); - - if (local->sh_struct->count_list[index] < - UNIFY_SELF_HEAL_GETDENTS_COUNT) { - LOCK (&frame->lock); - { - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - } else { - /* count == size, that means, there are more entries - to read from */ - local->sh_struct->offset_list[index] += - UNIFY_SELF_HEAL_GETDENTS_COUNT; - STACK_WIND_COOKIE (frame, - unify_sh_getdents_cbk, - cookie, - priv->xl_array[index], - priv->xl_array[index]->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - local->sh_struct->offset_list[index], - GF_GET_ALL); - - gf_log (this->name, GF_LOG_DEBUG, - "readdir on (%s) with offset %"PRId64"", - priv->xl_array[index]->name, - local->sh_struct->offset_list[index]); - } - - if (!callcnt) { - /* All storage nodes have done unified setdents on NS node. - * Now, do getdents from NS and do setdents on storage nodes. - */ - - /* sh_struct->offset_list is no longer required for - storage nodes now */ - local->sh_struct->offset_list[0] = 0; /* reset */ - - STACK_WIND (frame, - unify_sh_ns_getdents_cbk, - NS(this), - NS(this)->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - 0, /* In this call, do send '0' as offset */ - GF_GET_DIR_ONLY); - } - - return 0; -} - - -/** - * unify_sh_getdents_cbk - - */ -int32_t -unify_sh_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count) -{ - int32_t callcnt = -1; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - long index = (long)cookie; - dir_entry_t *tmp = NULL; - - if (op_ret >= 0 && count > 0) { - /* There is some dentry found, just send the dentry to NS */ - tmp = CALLOC (1, sizeof (dir_entry_t)); - local->sh_struct->entry_list[index] = tmp; - local->sh_struct->count_list[index] = count; - if (entry) { - tmp->next = entry->next; - entry->next = NULL; - } - STACK_WIND_COOKIE (frame, - unify_sh_ns_setdents_cbk, - cookie, - NS(this), - NS(this)->fops->setdents, - local->fd, - GF_SET_IF_NOT_PRESENT, - local->sh_struct->entry_list[index], - count); - return 0; - } - - if (count < UNIFY_SELF_HEAL_GETDENTS_COUNT) { - LOCK (&frame->lock); - { - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - } else { - /* count == size, that means, there are more entries - to read from */ - local->sh_struct->offset_list[index] += - UNIFY_SELF_HEAL_GETDENTS_COUNT; - STACK_WIND_COOKIE (frame, - unify_sh_getdents_cbk, - cookie, - priv->xl_array[index], - priv->xl_array[index]->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - local->sh_struct->offset_list[index], - GF_GET_ALL); - - gf_log (this->name, GF_LOG_DEBUG, - "readdir on (%s) with offset %"PRId64"", - priv->xl_array[index]->name, - local->sh_struct->offset_list[index]); - } - - if (!callcnt) { - /* All storage nodes have done unified setdents on NS node. - * Now, do getdents from NS and do setdents on storage nodes. - */ - - /* sh_struct->offset_list is no longer required for - storage nodes now */ - local->sh_struct->offset_list[0] = 0; /* reset */ - - STACK_WIND (frame, - unify_sh_ns_getdents_cbk, - NS(this), - NS(this)->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - 0, /* In this call, do send '0' as offset */ - GF_GET_DIR_ONLY); - } - - return 0; -} - -/** - * unify_sh_opendir_cbk - - * - * @cookie: - */ -int32_t -unify_sh_opendir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - int32_t callcnt = 0; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int16_t index = 0; - inode_t *inode = NULL; - dict_t *tmp_dict = NULL; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret >= 0) { - local->op_ret = op_ret; - } else { - gf_log (this->name, GF_LOG_WARNING, "failed"); - local->failed = 1; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local->call_count = priv->child_count + 1; - - if (!local->failed) { - /* send getdents() namespace after finishing - storage nodes */ - local->call_count--; - - fd_bind (fd); - - if (local->call_count) { - /* Used as the offset index. This list keeps - * track of offset sent to each node during - * STACK_WIND. - */ - local->sh_struct->offset_list = - calloc (priv->child_count, - sizeof (off_t)); - ERR_ABORT (local->sh_struct->offset_list); - - local->sh_struct->entry_list = - calloc (priv->child_count, - sizeof (dir_entry_t *)); - ERR_ABORT (local->sh_struct->entry_list); - - local->sh_struct->count_list = - calloc (priv->child_count, - sizeof (int)); - ERR_ABORT (local->sh_struct->count_list); - - /* Send getdents on all the fds */ - for (index = 0; - index < priv->child_count; index++) { - STACK_WIND_COOKIE (frame, - unify_sh_getdents_cbk, - (void *)(long)index, - priv->xl_array[index], - priv->xl_array[index]->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - 0, /* In this call, do send '0' as offset */ - GF_GET_ALL); - } - - /* did stack wind, so no need to unwind here */ - return 0; - } /* (local->call_count) */ - } /* (!local->failed) */ - - /* Opendir failed on one node. */ - inode = local->loc1.inode; - fd_unref (local->fd); - tmp_dict = local->dict; - - unify_local_wipe (local); - /* Only 'self-heal' failed, lookup() was successful. */ - local->op_ret = 0; - - /* This is lookup_cbk ()'s UNWIND. */ - STACK_UNWIND (frame, local->op_ret, local->op_errno, inode, - &local->stbuf, local->dict); - if (tmp_dict) - dict_unref (tmp_dict); - } - - return 0; -} - -/** - * gf_sh_checksum_cbk - - * - * @frame: frame used in lookup. get a copy of it, and use that copy. - * @this: pointer to unify xlator. - * @inode: pointer to inode, for which the consistency check is required. - * - */ -int32_t -unify_sh_checksum_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - uint8_t *file_checksum, - uint8_t *dir_checksum) -{ - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int16_t index = 0; - int32_t callcnt = 0; - inode_t *inode = NULL; - dict_t *tmp_dict = NULL; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret >= 0) { - if (NS(this) == (xlator_t *)cookie) { - memcpy (local->sh_struct->ns_file_checksum, - file_checksum, ZR_FILENAME_MAX); - memcpy (local->sh_struct->ns_dir_checksum, - dir_checksum, ZR_FILENAME_MAX); - } else { - if (local->entry_count == 0) { - /* Initialize the dir_checksum to be - * used for comparision with other - * storage nodes. Should be done for - * the first successful call *only*. - */ - /* Using 'entry_count' as a flag */ - local->entry_count = 1; - memcpy (local->sh_struct->dir_checksum, - dir_checksum, ZR_FILENAME_MAX); - } - - /* Reply from the storage nodes */ - for (index = 0; - index < ZR_FILENAME_MAX; index++) { - /* Files should be present in - only one node */ - local->sh_struct->file_checksum[index] ^= file_checksum[index]; - - /* directory structure should be - same accross */ - if (local->sh_struct->dir_checksum[index] != dir_checksum[index]) - local->failed = 1; - } - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - for (index = 0; index < ZR_FILENAME_MAX ; index++) { - if (local->sh_struct->file_checksum[index] != - local->sh_struct->ns_file_checksum[index]) { - local->failed = 1; - break; - } - if (local->sh_struct->dir_checksum[index] != - local->sh_struct->ns_dir_checksum[index]) { - local->failed = 1; - break; - } - } - - if (local->failed) { - /* Log it, it should be a rare event */ - gf_log (this->name, GF_LOG_WARNING, - "Self-heal triggered on directory %s", - local->loc1.path); - - /* Any self heal will be done at directory level */ - local->call_count = 0; - local->op_ret = -1; - local->failed = 0; - - local->fd = fd_create (local->loc1.inode, - frame->root->pid); - - local->call_count = priv->child_count + 1; - - for (index = 0; - index < (priv->child_count + 1); index++) { - STACK_WIND_COOKIE (frame, - unify_sh_opendir_cbk, - priv->xl_array[index]->name, - priv->xl_array[index], - priv->xl_array[index]->fops->opendir, - &local->loc1, - local->fd); - } - /* opendir can be done on the directory */ - return 0; - } - - /* no mismatch */ - inode = local->loc1.inode; - tmp_dict = local->dict; - - unify_local_wipe (local); - - /* This is lookup_cbk ()'s UNWIND. */ - STACK_UNWIND (frame, - local->op_ret, - local->op_errno, - inode, - &local->stbuf, - local->dict); - if (tmp_dict) - dict_unref (tmp_dict); - } - - return 0; -} - -/* Foreground self-heal part over */ - -/* Background self-heal part */ - -int32_t -unify_bgsh_setdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = -1; - unify_local_t *local = frame->local; - dir_entry_t *prev, *entry, *trav; - - LOCK (&frame->lock); - { - /* if local->call_count == 0, that means, setdents - on storagenodes is still pending. */ - if (local->call_count) - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - - if (callcnt == 0) { - if (local->sh_struct->entry_list[0]) { - prev = entry = local->sh_struct->entry_list[0]; - trav = entry->next; - while (trav) { - prev->next = trav->next; - FREE (trav->name); - if (S_ISLNK (trav->buf.st_mode)) - FREE (trav->link); - FREE (trav); - trav = prev->next; - } - FREE (entry); - } - - if (!local->flags) { - if (local->sh_struct->count_list[0] >= - UNIFY_SELF_HEAL_GETDENTS_COUNT) { - /* count == size, that means, there are more - entries to read from */ - //local->call_count = 0; - local->sh_struct->offset_list[0] += - UNIFY_SELF_HEAL_GETDENTS_COUNT; - STACK_WIND (frame, - unify_bgsh_ns_getdents_cbk, - NS(this), - NS(this)->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - local->sh_struct->offset_list[0], - GF_GET_DIR_ONLY); - } - } else { - fd_unref (local->fd); - unify_local_wipe (local); - STACK_DESTROY (frame->root); - } - } - - return 0; -} - - -int32_t -unify_bgsh_ns_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count) -{ - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - long index = 0; - unsigned long final = 0; - dir_entry_t *tmp = CALLOC (1, sizeof (dir_entry_t)); - - local->sh_struct->entry_list[0] = tmp; - local->sh_struct->count_list[0] = count; - if (entry) { - tmp->next = entry->next; - entry->next = NULL; - } - - if ((count < UNIFY_SELF_HEAL_GETDENTS_COUNT) || !entry) { - final = 1; - } - - LOCK (&frame->lock); - { - /* local->call_count will be '0' till now. make it 1 so, - it can be UNWIND'ed for the last call. */ - local->call_count = priv->child_count; - if (final) - local->flags = 1; - } - UNLOCK (&frame->lock); - - for (index = 0; index < priv->child_count; index++) - { - STACK_WIND_COOKIE (frame, - unify_bgsh_setdents_cbk, - (void *)index, - priv->xl_array[index], - priv->xl_array[index]->fops->setdents, - local->fd, GF_SET_DIR_ONLY, - local->sh_struct->entry_list[0], count); - } - - return 0; -} - -int32_t -unify_bgsh_ns_setdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = -1; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - long index = (long)cookie; - dir_entry_t *prev, *entry, *trav; - - if (local->sh_struct->entry_list[index]) { - prev = entry = local->sh_struct->entry_list[index]; - if (!entry) - return 0; - trav = entry->next; - while (trav) { - prev->next = trav->next; - FREE (trav->name); - if (S_ISLNK (trav->buf.st_mode)) - FREE (trav->link); - FREE (trav); - trav = prev->next; - } - FREE (entry); - } - - if (local->sh_struct->count_list[index] < - UNIFY_SELF_HEAL_GETDENTS_COUNT) { - LOCK (&frame->lock); - { - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - } else { - /* count == size, that means, there are more entries - to read from */ - local->sh_struct->offset_list[index] += - UNIFY_SELF_HEAL_GETDENTS_COUNT; - STACK_WIND_COOKIE (frame, - unify_bgsh_getdents_cbk, - cookie, - priv->xl_array[index], - priv->xl_array[index]->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - local->sh_struct->offset_list[index], - GF_GET_ALL); - - gf_log (this->name, GF_LOG_DEBUG, - "readdir on (%s) with offset %"PRId64"", - priv->xl_array[index]->name, - local->sh_struct->offset_list[index]); - } - - if (!callcnt) { - /* All storage nodes have done unified setdents on NS node. - * Now, do getdents from NS and do setdents on storage nodes. - */ - - /* sh_struct->offset_list is no longer required for - storage nodes now */ - local->sh_struct->offset_list[0] = 0; /* reset */ - - STACK_WIND (frame, - unify_bgsh_ns_getdents_cbk, - NS(this), - NS(this)->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - 0, /* In this call, do send '0' as offset */ - GF_GET_DIR_ONLY); - } - - return 0; -} - - -/** - * unify_bgsh_getdents_cbk - - */ -int32_t -unify_bgsh_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count) -{ - int32_t callcnt = -1; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - long index = (long)cookie; - dir_entry_t *tmp = NULL; - - if (op_ret >= 0 && count > 0) { - /* There is some dentry found, just send the dentry to NS */ - tmp = CALLOC (1, sizeof (dir_entry_t)); - local->sh_struct->entry_list[index] = tmp; - local->sh_struct->count_list[index] = count; - if (entry) { - tmp->next = entry->next; - entry->next = NULL; - } - STACK_WIND_COOKIE (frame, - unify_bgsh_ns_setdents_cbk, - cookie, - NS(this), - NS(this)->fops->setdents, - local->fd, - GF_SET_IF_NOT_PRESENT, - local->sh_struct->entry_list[index], - count); - return 0; - } - - if (count < UNIFY_SELF_HEAL_GETDENTS_COUNT) { - LOCK (&frame->lock); - { - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - } else { - /* count == size, that means, there are more entries to read from */ - local->sh_struct->offset_list[index] += - UNIFY_SELF_HEAL_GETDENTS_COUNT; - - STACK_WIND_COOKIE (frame, - unify_bgsh_getdents_cbk, - cookie, - priv->xl_array[index], - priv->xl_array[index]->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - local->sh_struct->offset_list[index], - GF_GET_ALL); - - gf_log (this->name, GF_LOG_DEBUG, - "readdir on (%s) with offset %"PRId64"", - priv->xl_array[index]->name, - local->sh_struct->offset_list[index]); - } - - if (!callcnt) { - /* All storage nodes have done unified setdents on NS node. - * Now, do getdents from NS and do setdents on storage nodes. - */ - - /* sh_struct->offset_list is no longer required for - storage nodes now */ - local->sh_struct->offset_list[0] = 0; /* reset */ - - STACK_WIND (frame, - unify_bgsh_ns_getdents_cbk, - NS(this), - NS(this)->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - 0, /* In this call, do send '0' as offset */ - GF_GET_DIR_ONLY); - } - - return 0; -} - -/** - * unify_bgsh_opendir_cbk - - * - * @cookie: - */ -int32_t -unify_bgsh_opendir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int32_t callcnt = 0; - int16_t index = 0; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret >= 0) { - local->op_ret = op_ret; - } else { - local->failed = 1; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local->call_count = priv->child_count + 1; - - if (!local->failed) { - /* send getdents() namespace after finishing - storage nodes */ - local->call_count--; - callcnt = local->call_count; - - fd_bind (fd); - - if (local->call_count) { - /* Used as the offset index. This list keeps - track of offset sent to each node during - STACK_WIND. */ - local->sh_struct->offset_list = - calloc (priv->child_count, - sizeof (off_t)); - ERR_ABORT (local->sh_struct->offset_list); - - local->sh_struct->entry_list = - calloc (priv->child_count, - sizeof (dir_entry_t *)); - ERR_ABORT (local->sh_struct->entry_list); - - local->sh_struct->count_list = - calloc (priv->child_count, - sizeof (int)); - ERR_ABORT (local->sh_struct->count_list); - - /* Send getdents on all the fds */ - for (index = 0; - index < priv->child_count; index++) { - STACK_WIND_COOKIE (frame, - unify_bgsh_getdents_cbk, - (void *)(long)index, - priv->xl_array[index], - priv->xl_array[index]->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - 0, /* In this call, do send '0' as offset */ - GF_GET_ALL); - } - /* did a stack wind, so no need to unwind here */ - return 0; - } /* (local->call_count) */ - } /* (!local->failed) */ - - /* Opendir failed on one node. */ - fd_unref (local->fd); - - unify_local_wipe (local); - STACK_DESTROY (frame->root); - } - - return 0; -} - -/** - * gf_bgsh_checksum_cbk - - * - * @frame: frame used in lookup. get a copy of it, and use that copy. - * @this: pointer to unify xlator. - * @inode: pointer to inode, for which the consistency check is required. - * - */ -int32_t -unify_bgsh_checksum_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - uint8_t *file_checksum, - uint8_t *dir_checksum) -{ - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int16_t index = 0; - int32_t callcnt = 0; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret >= 0) { - if (NS(this) == (xlator_t *)cookie) { - memcpy (local->sh_struct->ns_file_checksum, - file_checksum, ZR_FILENAME_MAX); - memcpy (local->sh_struct->ns_dir_checksum, - dir_checksum, ZR_FILENAME_MAX); - } else { - if (local->entry_count == 0) { - /* Initialize the dir_checksum to be - * used for comparision with other - * storage nodes. Should be done for - * the first successful call *only*. - */ - /* Using 'entry_count' as a flag */ - local->entry_count = 1; - memcpy (local->sh_struct->dir_checksum, - dir_checksum, ZR_FILENAME_MAX); - } - - /* Reply from the storage nodes */ - for (index = 0; - index < ZR_FILENAME_MAX; index++) { - /* Files should be present in only - one node */ - local->sh_struct->file_checksum[index] ^= file_checksum[index]; - - /* directory structure should be same - accross */ - if (local->sh_struct->dir_checksum[index] != dir_checksum[index]) - local->failed = 1; - } - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - for (index = 0; index < ZR_FILENAME_MAX ; index++) { - if (local->sh_struct->file_checksum[index] != - local->sh_struct->ns_file_checksum[index]) { - local->failed = 1; - break; - } - if (local->sh_struct->dir_checksum[index] != - local->sh_struct->ns_dir_checksum[index]) { - local->failed = 1; - break; - } - } - - if (local->failed) { - /* Log it, it should be a rare event */ - gf_log (this->name, GF_LOG_WARNING, - "Self-heal triggered on directory %s", - local->loc1.path); - - /* Any self heal will be done at the directory level */ - local->op_ret = -1; - local->failed = 0; - - local->fd = fd_create (local->loc1.inode, - frame->root->pid); - local->call_count = priv->child_count + 1; - - for (index = 0; - index < (priv->child_count + 1); index++) { - STACK_WIND_COOKIE (frame, - unify_bgsh_opendir_cbk, - priv->xl_array[index]->name, - priv->xl_array[index], - priv->xl_array[index]->fops->opendir, - &local->loc1, - local->fd); - } - - /* opendir can be done on the directory */ - return 0; - } - - /* no mismatch */ - unify_local_wipe (local); - STACK_DESTROY (frame->root); - } - - return 0; -} - -/* Background self-heal part over */ - - - - -/** - * zr_unify_self_heal - - * - * @frame: frame used in lookup. get a copy of it, and use that copy. - * @this: pointer to unify xlator. - * @inode: pointer to inode, for which the consistency check is required. - * - */ -int32_t -zr_unify_self_heal (call_frame_t *frame, - xlator_t *this, - unify_local_t *local) -{ - unify_private_t *priv = this->private; - call_frame_t *bg_frame = NULL; - unify_local_t *bg_local = NULL; - inode_t *tmp_inode = NULL; - dict_t *tmp_dict = NULL; - int16_t index = 0; - - if (local->inode_generation < priv->inode_generation) { - /* Any self heal will be done at the directory level */ - /* Update the inode's generation to the current generation - value. */ - local->inode_generation = priv->inode_generation; - inode_ctx_put (local->loc1.inode, this, - (uint64_t)(long)local->inode_generation); - - if (priv->self_heal == ZR_UNIFY_FG_SELF_HEAL) { - local->op_ret = 0; - local->failed = 0; - local->call_count = priv->child_count + 1; - local->sh_struct = - calloc (1, sizeof (struct unify_self_heal_struct)); - - /* +1 is for NS */ - for (index = 0; - index < (priv->child_count + 1); index++) { - STACK_WIND_COOKIE (frame, - unify_sh_checksum_cbk, - priv->xl_array[index], - priv->xl_array[index], - priv->xl_array[index]->fops->checksum, - &local->loc1, - 0); - } - - /* Self-heal in foreground, hence no need - to UNWIND here */ - return 0; - } - - /* Self Heal done in background */ - bg_frame = copy_frame (frame); - INIT_LOCAL (bg_frame, bg_local); - loc_copy (&bg_local->loc1, &local->loc1); - bg_local->op_ret = 0; - bg_local->failed = 0; - bg_local->call_count = priv->child_count + 1; - bg_local->sh_struct = - calloc (1, sizeof (struct unify_self_heal_struct)); - - /* +1 is for NS */ - for (index = 0; index < (priv->child_count + 1); index++) { - STACK_WIND_COOKIE (bg_frame, - unify_bgsh_checksum_cbk, - priv->xl_array[index], - priv->xl_array[index], - priv->xl_array[index]->fops->checksum, - &bg_local->loc1, - 0); - } - } - - /* generation number matches, self heal already done or - * self heal done in background: just do STACK_UNWIND - */ - tmp_inode = local->loc1.inode; - tmp_dict = local->dict; - - unify_local_wipe (local); - - /* This is lookup_cbk ()'s UNWIND. */ - STACK_UNWIND (frame, - local->op_ret, - local->op_errno, - tmp_inode, - &local->stbuf, - local->dict); - - if (tmp_dict) - dict_unref (tmp_dict); - - return 0; -} - diff --git a/xlators/cluster/unify/src/unify.c b/xlators/cluster/unify/src/unify.c deleted file mode 100644 index 35c108963..000000000 --- a/xlators/cluster/unify/src/unify.c +++ /dev/null @@ -1,4506 +0,0 @@ -/* - Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -/** - * xlators/cluster/unify: - * - This xlator is one of the main translator in GlusterFS, which - * actually does the clustering work of the file system. One need to - * understand that, unify assumes file to be existing in only one of - * the child node, and directories to be present on all the nodes. - * - * NOTE: - * Now, unify has support for global namespace, which is used to keep a - * global view of fs's namespace tree. The stat for directories are taken - * just from the namespace, where as for files, just 'st_ino' is taken from - * Namespace node, and other stat info is taken from the actual storage node. - * Also Namespace node helps to keep consistant inode for files across - * glusterfs (re-)mounts. - */ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "unify.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "defaults.h" -#include "common-utils.h" -#include <signal.h> -#include <libgen.h> -#include "compat-errno.h" -#include "compat.h" - -#define UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR(_loc) do { \ - if (!(_loc && _loc->inode)) { \ - STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL); \ - return 0; \ - } \ -} while(0) - - -#define UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR(_fd) do { \ - if (!(_fd && !fd_ctx_get (_fd, this, NULL))) { \ - STACK_UNWIND (frame, -1, EBADFD, NULL, NULL); \ - return 0; \ - } \ -} while(0) - -#define UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(_fd) do { \ - if (!_fd) { \ - STACK_UNWIND (frame, -1, EBADFD, NULL, NULL); \ - return 0; \ - } \ -} while(0) - -/** - * unify_local_wipe - free all the extra allocation of local->* here. - */ -static void -unify_local_wipe (unify_local_t *local) -{ - /* Free the strdup'd variables in the local structure */ - if (local->name) { - FREE (local->name); - } - loc_wipe (&local->loc1); - loc_wipe (&local->loc2); -} - - - -/* - * unify_normalize_stats - - */ -void -unify_normalize_stats (struct statvfs *buf, - unsigned long bsize, - unsigned long frsize) -{ - double factor; - - if (buf->f_bsize != bsize) { - factor = ((double) buf->f_bsize) / bsize; - buf->f_bsize = bsize; - buf->f_bfree = (fsblkcnt_t) (factor * buf->f_bfree); - buf->f_bavail = (fsblkcnt_t) (factor * buf->f_bavail); - } - - if (buf->f_frsize != frsize) { - factor = ((double) buf->f_frsize) / frsize; - buf->f_frsize = frsize; - buf->f_blocks = (fsblkcnt_t) (factor * buf->f_blocks); - } -} - - -xlator_t * -unify_loc_subvol (loc_t *loc, xlator_t *this) -{ - unify_private_t *priv = NULL; - xlator_t *subvol = NULL; - int16_t *list = NULL; - long index = 0; - xlator_t *subvol_i = NULL; - int ret = 0; - uint64_t tmp_list = 0; - - priv = this->private; - subvol = NS (this); - - if (!S_ISDIR (loc->inode->st_mode)) { - ret = inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - if (!list) - goto out; - - for (index = 0; list[index] != -1; index++) { - subvol_i = priv->xl_array[list[index]]; - if (subvol_i != NS (this)) { - subvol = subvol_i; - break; - } - } - } -out: - return subvol; -} - - - -/** - * unify_statfs_cbk - - */ -int32_t -unify_statfs_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct statvfs *stbuf) -{ - int32_t callcnt = 0; - struct statvfs *dict_buf = NULL; - unsigned long bsize; - unsigned long frsize; - unify_local_t *local = (unify_local_t *)frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - if (op_ret >= 0) { - /* when a call is successfull, add it to local->dict */ - dict_buf = &local->statvfs_buf; - - if (dict_buf->f_bsize != 0) { - bsize = max (dict_buf->f_bsize, - stbuf->f_bsize); - - frsize = max (dict_buf->f_frsize, - stbuf->f_frsize); - unify_normalize_stats(dict_buf, bsize, frsize); - unify_normalize_stats(stbuf, bsize, frsize); - } else { - dict_buf->f_bsize = stbuf->f_bsize; - dict_buf->f_frsize = stbuf->f_frsize; - } - - dict_buf->f_blocks += stbuf->f_blocks; - dict_buf->f_bfree += stbuf->f_bfree; - dict_buf->f_bavail += stbuf->f_bavail; - dict_buf->f_files += stbuf->f_files; - dict_buf->f_ffree += stbuf->f_ffree; - dict_buf->f_favail += stbuf->f_favail; - dict_buf->f_fsid = stbuf->f_fsid; - dict_buf->f_flag = stbuf->f_flag; - dict_buf->f_namemax = stbuf->f_namemax; - local->op_ret = op_ret; - } else { - /* fop on storage node has failed due to some error */ - if (op_errno != ENOTCONN) { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): %s", - prev_frame->this->name, - strerror (op_errno)); - } - local->op_errno = op_errno; - } - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->statvfs_buf); - } - - return 0; -} - -/** - * unify_statfs - - */ -int32_t -unify_statfs (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - unify_local_t *local = NULL; - xlator_list_t *trav = this->children; - - INIT_LOCAL (frame, local); - local->call_count = ((unify_private_t *)this->private)->child_count; - - while(trav) { - STACK_WIND (frame, - unify_statfs_cbk, - trav->xlator, - trav->xlator->fops->statfs, - loc); - trav = trav->next; - } - - return 0; -} - -/** - * unify_buf_cbk - - */ -int32_t -unify_buf_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *buf) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "%s(): child(%s): path(%s): %s", - gf_fop_list[frame->root->op], - prev_frame->this->name, - (local->loc1.path)?local->loc1.path:"", - strerror (op_errno)); - - local->op_errno = op_errno; - if ((op_errno == ENOENT) && priv->optimist) - local->op_ret = 0; - } - - if (op_ret >= 0) { - local->op_ret = 0; - - if (NS (this) == prev_frame->this) { - local->st_ino = buf->st_ino; - /* If the entry is directory, get the stat - from NS node */ - if (S_ISDIR (buf->st_mode) || - !local->stbuf.st_blksize) { - local->stbuf = *buf; - } - } - - if ((!S_ISDIR (buf->st_mode)) && - (NS (this) != prev_frame->this)) { - /* If file, take the stat info from Storage - node. */ - local->stbuf = *buf; - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - /* If the inode number is not filled, operation should - fail */ - if (!local->st_ino) - local->op_ret = -1; - - local->stbuf.st_ino = local->st_ino; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->stbuf); - } - - return 0; -} - -#define check_if_dht_linkfile(s) ((s->st_mode & ~S_IFMT) == S_ISVTX) - -/** - * unify_lookup_cbk - - */ -int32_t -unify_lookup_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct stat *buf, - dict_t *dict) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - inode_t *tmp_inode = NULL; - dict_t *local_dict = NULL; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - if (local->revalidate && - (op_errno == ESTALE)) { - /* ESTALE takes priority */ - local->op_errno = op_errno; - local->failed = 1; - } - - if ((op_errno != ENOTCONN) - && (op_errno != ENOENT) - && (local->op_errno != ESTALE)) { - /* if local->op_errno is already ESTALE, then - * ESTALE has to propogated to the parent first. - * do not enter here. - */ - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - priv->xl_array[(long)cookie]->name, - local->loc1.path, strerror (op_errno)); - local->op_errno = op_errno; - local->failed = 1; - - } else if (local->revalidate && - (local->op_errno != ESTALE) && - !(priv->optimist && (op_errno == ENOENT))) { - - gf_log (this->name, - (op_errno == ENOTCONN) ? - GF_LOG_DEBUG:GF_LOG_ERROR, - "child(%s): path(%s): %s", - priv->xl_array[(long)cookie]->name, - local->loc1.path, strerror (op_errno)); - local->op_errno = op_errno; - local->failed = 1; - } - } - - if (op_ret == 0) { - local->op_ret = 0; - - if (check_if_dht_linkfile(buf)) { - gf_log (this->name, GF_LOG_CRITICAL, - "file %s may be DHT link file on %s, " - "make sure the backend is not shared " - "between unify and DHT", - local->loc1.path, - priv->xl_array[(long)cookie]->name); - } - - if (local->stbuf.st_mode && local->stbuf.st_blksize) { - /* make sure we already have a stbuf - stored in local->stbuf */ - if (S_ISDIR (local->stbuf.st_mode) && - !S_ISDIR (buf->st_mode)) { - gf_log (this->name, GF_LOG_CRITICAL, - "[CRITICAL] '%s' is directory " - "on namespace, non-directory " - "on node '%s', returning EIO", - local->loc1.path, - priv->xl_array[(long)cookie]->name); - local->return_eio = 1; - } - if (!S_ISDIR (local->stbuf.st_mode) && - S_ISDIR (buf->st_mode)) { - gf_log (this->name, GF_LOG_CRITICAL, - "[CRITICAL] '%s' is directory " - "on node '%s', non-directory " - "on namespace, returning EIO", - local->loc1.path, - priv->xl_array[(long)cookie]->name); - local->return_eio = 1; - } - } - - if (!local->revalidate && !S_ISDIR (buf->st_mode)) { - /* This is the first time lookup on file*/ - if (!local->list) { - /* list is not allocated, allocate - the max possible range */ - local->list = CALLOC (1, 2 * (priv->child_count + 2)); - if (!local->list) { - gf_log (this->name, - GF_LOG_CRITICAL, - "Not enough memory"); - STACK_UNWIND (frame, -1, - ENOMEM, inode, - NULL, NULL); - return 0; - } - } - /* update the index of the list */ - local->list [local->index++] = - (int16_t)(long)cookie; - } - - if (!local->revalidate && S_ISDIR (buf->st_mode)) { - /* fresh lookup of a directory */ - inode_ctx_put (local->loc1.inode, this, - priv->inode_generation); - } - - if ((!local->dict) && dict && - (priv->xl_array[(long)cookie] != NS(this))) { - local->dict = dict_ref (dict); - } - - /* index of NS node is == total child count */ - if (priv->child_count == (int16_t)(long)cookie) { - /* Take the inode number from namespace */ - local->st_ino = buf->st_ino; - if (S_ISDIR (buf->st_mode) || - !(local->stbuf.st_blksize)) { - local->stbuf = *buf; - } - } else if (!S_ISDIR (buf->st_mode)) { - /* If file, then get the stat from - storage node */ - local->stbuf = *buf; - } - - if (local->st_nlink < buf->st_nlink) { - local->st_nlink = buf->st_nlink; - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local_dict = local->dict; - if (local->return_eio) { - gf_log (this->name, GF_LOG_CRITICAL, - "[CRITICAL] Unable to fix the path (%s) with " - "self-heal, try manual verification. " - "returning EIO.", local->loc1.path); - unify_local_wipe (local); - STACK_UNWIND (frame, -1, EIO, inode, NULL, NULL); - if (local_dict) { - dict_unref (local_dict); - } - return 0; - } - - if (!local->stbuf.st_blksize) { - /* Inode not present */ - local->op_ret = -1; - } else { - if (!local->revalidate && - !S_ISDIR (local->stbuf.st_mode)) { - /* If its a file, big array is useless, - allocate the smaller one */ - int16_t *list = NULL; - list = CALLOC (1, 2 * (local->index + 1)); - ERR_ABORT (list); - memcpy (list, local->list, 2 * local->index); - /* Make the end of the list as -1 */ - FREE (local->list); - local->list = list; - local->list [local->index] = -1; - /* Update the inode's ctx with proper array */ - /* TODO: log on failure */ - inode_ctx_put (local->loc1.inode, this, - (uint64_t)(long)local->list); - } - - if (S_ISDIR(local->loc1.inode->st_mode)) { - /* lookup is done for directory */ - if (local->failed && priv->self_heal) { - /* Triggering self-heal */ - /* means, self-heal required for this - inode */ - local->inode_generation = 0; - priv->inode_generation++; - } - } else { - local->stbuf.st_ino = local->st_ino; - } - - local->stbuf.st_nlink = local->st_nlink; - } - if (local->op_ret == -1) { - if (!local->revalidate && local->list) - FREE (local->list); - } - - if ((local->op_ret >= 0) && local->failed && - local->revalidate) { - /* Done revalidate, but it failed */ - if ((op_errno != ENOTCONN) - && (local->op_errno != ESTALE)) { - gf_log (this->name, GF_LOG_ERROR, - "Revalidate failed for path(%s): %s", - local->loc1.path, strerror (op_errno)); - } - local->op_ret = -1; - } - - if ((priv->self_heal && !priv->optimist) && - (!local->revalidate && (local->op_ret == 0) && - S_ISDIR(local->stbuf.st_mode))) { - /* Let the self heal be done here */ - zr_unify_self_heal (frame, this, local); - local_dict = NULL; - } else { - if (local->failed) { - /* NOTE: directory lookup is sent to all - * subvolumes and success from a subvolume - * might set local->op_ret to 0 (zero) */ - local->op_ret = -1; - } - - /* either no self heal, or op_ret == -1 (failure) */ - tmp_inode = local->loc1.inode; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - tmp_inode, &local->stbuf, local->dict); - } - if (local_dict) { - dict_unref (local_dict); - } - } - - return 0; -} - -/** - * unify_lookup - - */ -int32_t -unify_lookup (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *xattr_req) -{ - unify_local_t *local = NULL; - unify_private_t *priv = this->private; - int16_t *list = NULL; - long index = 0; - - if (!(loc && loc->inode)) { - gf_log (this->name, GF_LOG_ERROR, - "%s: Argument not right", loc?loc->path:"(null)"); - STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL); - return 0; - } - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - if (local->loc1.path == NULL) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL, NULL); - return 0; - } - - if (inode_ctx_get (loc->inode, this, NULL) - && S_ISDIR (loc->inode->st_mode)) { - local->revalidate = 1; - } - - if (!inode_ctx_get (loc->inode, this, NULL) && - loc->inode->st_mode && - !S_ISDIR (loc->inode->st_mode)) { - uint64_t tmp_list = 0; - /* check if revalidate or fresh lookup */ - inode_ctx_get (loc->inode, this, &tmp_list); - local->list = (int16_t *)(long)tmp_list; - } - - if (local->list) { - list = local->list; - for (index = 0; list[index] != -1; index++); - if (index != 2) { - if (index < 2) { - gf_log (this->name, GF_LOG_ERROR, - "returning ESTALE for %s: file " - "count is %ld", loc->path, index); - /* Print where all the file is present */ - for (index = 0; - local->list[index] != -1; index++) { - gf_log (this->name, GF_LOG_ERROR, - "%s: found on %s", loc->path, - priv->xl_array[list[index]]->name); - } - unify_local_wipe (local); - STACK_UNWIND (frame, -1, ESTALE, - NULL, NULL, NULL); - return 0; - } else { - /* There are more than 2 presences */ - /* Just log and continue */ - gf_log (this->name, GF_LOG_ERROR, - "%s: file count is %ld", - loc->path, index); - /* Print where all the file is present */ - for (index = 0; - local->list[index] != -1; index++) { - gf_log (this->name, GF_LOG_ERROR, - "%s: found on %s", loc->path, - priv->xl_array[list[index]]->name); - } - } - } - - /* is revalidate */ - local->revalidate = 1; - - for (index = 0; list[index] != -1; index++) - local->call_count++; - - for (index = 0; list[index] != -1; index++) { - char need_break = (list[index+1] == -1); - STACK_WIND_COOKIE (frame, - unify_lookup_cbk, - (void *)(long)list[index], //cookie - priv->xl_array [list[index]], - priv->xl_array [list[index]]->fops->lookup, - loc, - xattr_req); - if (need_break) - break; - } - } else { - if (loc->inode->st_mode) { - if (inode_ctx_get (loc->inode, this, NULL)) { - inode_ctx_get (loc->inode, this, - &local->inode_generation); - } - } - /* This is first call, there is no list */ - /* call count should be all child + 1 namespace */ - local->call_count = priv->child_count + 1; - - for (index = 0; index <= priv->child_count; index++) { - STACK_WIND_COOKIE (frame, - unify_lookup_cbk, - (void *)index, //cookie - priv->xl_array[index], - priv->xl_array[index]->fops->lookup, - loc, - xattr_req); - } - } - - return 0; -} - -/** - * unify_stat - if directory, get the stat directly from NameSpace child. - * if file, check for a hint and send it only there (also to NS). - * if its a fresh stat, then do it on all the nodes. - * - * NOTE: for all the call, sending cookie as xlator pointer, which will be - * used in cbk. - */ -int32_t -unify_stat (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - unify_local_t *local = NULL; - unify_private_t *priv = this->private; - int16_t index = 0; - int16_t *list = NULL; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - if (local->loc1.path == NULL) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; - } - local->st_ino = loc->inode->ino; - if (S_ISDIR (loc->inode->st_mode)) { - /* Directory */ - local->call_count = 1; - STACK_WIND (frame, unify_buf_cbk, NS(this), - NS(this)->fops->stat, loc); - } else { - /* File */ - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - for (index = 0; list[index] != -1; index++) - local->call_count++; - - for (index = 0; list[index] != -1; index++) { - char need_break = (list[index+1] == -1); - STACK_WIND (frame, - unify_buf_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->stat, - loc); - if (need_break) - break; - } - } - - return 0; -} - -/** - * unify_access_cbk - - */ -int32_t -unify_access_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - - -/** - * unify_access - Send request to only namespace, which has all the - * attributes set for the file. - */ -int32_t -unify_access (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t mask) -{ - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - STACK_WIND (frame, - unify_access_cbk, - NS(this), - NS(this)->fops->access, - loc, - mask); - - return 0; -} - -int32_t -unify_mkdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct stat *buf) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - inode_t *tmp_inode = NULL; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if ((op_ret == -1) && !(priv->optimist && - (op_errno == ENOENT || - op_errno == EEXIST))) { - /* TODO: Decrement the inode_generation of - * this->inode's parent inode, hence the missing - * directory is created properly by self-heal. - * Currently, there is no way to get the parent - * inode directly. - */ - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - priv->xl_array[(long)cookie]->name, - local->loc1.path, strerror (op_errno)); - if (op_errno != EEXIST) - local->failed = 1; - local->op_errno = op_errno; - } - - if (op_ret >= 0) - local->op_ret = 0; - - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (!local->failed) { - inode_ctx_put (local->loc1.inode, this, - priv->inode_generation); - } - - tmp_inode = local->loc1.inode; - unify_local_wipe (local); - - STACK_UNWIND (frame, local->op_ret, local->op_errno, - tmp_inode, &local->stbuf); - } - - return 0; -} - -/** - * unify_ns_mkdir_cbk - - */ -int32_t -unify_ns_mkdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct stat *buf) -{ - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - long index = 0; - - if (op_ret == -1) { - /* No need to send mkdir request to other servers, - * as namespace action failed - */ - gf_log (this->name, GF_LOG_ERROR, - "namespace: path(%s): %s", - local->name, strerror (op_errno)); - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, inode, NULL); - return 0; - } - - /* Create one inode for this entry */ - local->op_ret = 0; - local->stbuf = *buf; - - local->call_count = priv->child_count; - - /* Send mkdir request to all the nodes now */ - for (index = 0; index < priv->child_count; index++) { - STACK_WIND_COOKIE (frame, - unify_mkdir_cbk, - (void *)index, //cookie - priv->xl_array[index], - priv->xl_array[index]->fops->mkdir, - &local->loc1, - local->mode); - } - - return 0; -} - - -/** - * unify_mkdir - - */ -int32_t -unify_mkdir (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - mode_t mode) -{ - unify_local_t *local = NULL; - - /* Initialization */ - INIT_LOCAL (frame, local); - local->mode = mode; - - loc_copy (&local->loc1, loc); - - if (local->loc1.path == NULL) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL); - return 0; - } - - STACK_WIND (frame, - unify_ns_mkdir_cbk, - NS(this), - NS(this)->fops->mkdir, - loc, - mode); - return 0; -} - -/** - * unify_rmdir_cbk - - */ -int32_t -unify_rmdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret == 0 || (priv->optimist && (op_errno == ENOENT))) - local->op_ret = 0; - if (op_ret == -1) - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno); - } - - return 0; -} - -/** - * unify_ns_rmdir_cbk - - */ -int32_t -unify_ns_rmdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int16_t index = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - if (op_ret == -1) { - /* No need to send rmdir request to other servers, - * as namespace action failed - */ - gf_log (this->name, - ((op_errno != ENOTEMPTY) ? - GF_LOG_ERROR : GF_LOG_DEBUG), - "namespace: path(%s): %s", - local->loc1.path, strerror (op_errno)); - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno); - return 0; - } - - local->call_count = priv->child_count; - - for (index = 0; index < priv->child_count; index++) { - STACK_WIND (frame, - unify_rmdir_cbk, - priv->xl_array[index], - priv->xl_array[index]->fops->rmdir, - &local->loc1); - } - - return 0; -} - -/** - * unify_rmdir - - */ -int32_t -unify_rmdir (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - unify_local_t *local = NULL; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - - loc_copy (&local->loc1, loc); - if (local->loc1.path == NULL) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM); - return 0; - } - - STACK_WIND (frame, - unify_ns_rmdir_cbk, - NS(this), - NS(this)->fops->rmdir, - loc); - - return 0; -} - -/** - * unify_open_cbk - - */ -int32_t -unify_open_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - int32_t callcnt = 0; - unify_local_t *local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret >= 0) { - local->op_ret = op_ret; - if (NS(this) != (xlator_t *)cookie) { - /* Store child node's ptr, used in - all the f*** / FileIO calls */ - fd_ctx_set (fd, this, (uint64_t)(long)cookie); - } - } - if (op_ret == -1) { - local->op_errno = op_errno; - local->failed = 1; - } - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if ((local->failed == 1) && (local->op_ret >= 0)) { - local->call_count = 1; - /* return -1 to user */ - local->op_ret = -1; - //local->op_errno = EIO; - - if (!fd_ctx_get (local->fd, this, NULL)) { - gf_log (this->name, GF_LOG_ERROR, - "Open success on child node, " - "failed on namespace"); - } else { - gf_log (this->name, GF_LOG_ERROR, - "Open success on namespace, " - "failed on child node"); - } - } - - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, - local->op_errno, local->fd); - } - - return 0; -} - -#ifdef GF_DARWIN_HOST_OS -/** - * unify_create_lookup_cbk - - */ -int32_t -unify_open_lookup_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct stat *buf, - dict_t *dict) -{ - int32_t callcnt = 0; - int16_t index = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if ((op_ret == -1) && (op_errno != ENOENT)) { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - priv->xl_array[(long)cookie]->name, - local->loc1.path, strerror (op_errno)); - local->op_errno = op_errno; - } - - if (op_ret >= 0) { - local->op_ret = op_ret; - local->index++; - if (NS(this) == priv->xl_array[(long)cookie]) { - local->list[0] = (int16_t)(long)cookie; - } else { - local->list[1] = (int16_t)(long)cookie; - } - if (S_ISDIR (buf->st_mode)) - local->failed = 1; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - int16_t file_list[3] = {0,}; - local->op_ret = -1; - - file_list[0] = local->list[0]; - file_list[1] = local->list[1]; - file_list[2] = -1; - - if (local->index != 2) { - /* Lookup failed, can't do open */ - gf_log (this->name, GF_LOG_ERROR, - "%s: present on %d nodes", - local->name, local->index); - - if (local->index < 2) { - unify_local_wipe (local); - gf_log (this->name, GF_LOG_ERROR, - "returning as file found on less " - "than 2 nodes"); - STACK_UNWIND (frame, local->op_ret, - local->op_errno, local->fd); - return 0; - } - } - - if (local->failed) { - /* Open on directory, return EISDIR */ - unify_local_wipe (local); - STACK_UNWIND (frame, -1, EISDIR, local->fd); - return 0; - } - - /* Everything is perfect :) */ - local->call_count = 2; - - for (index = 0; file_list[index] != -1; index++) { - char need_break = (file_list[index+1] == -1); - STACK_WIND_COOKIE (frame, - unify_open_cbk, - priv->xl_array[file_list[index]], - priv->xl_array[file_list[index]], - priv->xl_array[file_list[index]]->fops->open, - &local->loc1, - local->flags, - local->fd); - if (need_break) - break; - } - } - - return 0; -} - - -int32_t -unify_open_readlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - const char *path) -{ - int16_t index = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - if (op_ret == -1) { - STACK_UNWIND (frame, -1, ENOENT); - return 0; - } - - if (path[0] == '/') { - local->name = strdup (path); - ERR_ABORT (local->name); - } else { - char *tmp_str = strdup (local->loc1.path); - char *tmp_base = dirname (tmp_str); - local->name = CALLOC (1, ZR_PATH_MAX); - strcpy (local->name, tmp_base); - strncat (local->name, "/", 1); - strcat (local->name, path); - FREE (tmp_str); - } - - local->list = CALLOC (1, sizeof (int16_t) * 3); - ERR_ABORT (local->list); - local->call_count = priv->child_count + 1; - local->op_ret = -1; - for (index = 0; index <= priv->child_count; index++) { - /* Send the lookup to all the nodes including namespace */ - STACK_WIND_COOKIE (frame, - unify_open_lookup_cbk, - (void *)(long)index, - priv->xl_array[index], - priv->xl_array[index]->fops->lookup, - &local->loc1, - NULL); - } - - return 0; -} -#endif /* GF_DARWIN_HOST_OS */ - -/** - * unify_open - - */ -int32_t -unify_open (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flags, - fd_t *fd) -{ - unify_private_t *priv = this->private; - unify_local_t *local = NULL; - int16_t *list = NULL; - int16_t index = 0; - int16_t file_list[3] = {0,}; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Init */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - local->fd = fd; - local->flags = flags; - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - local->list = list; - file_list[0] = priv->child_count; /* Thats namespace */ - file_list[2] = -1; - for (index = 0; list[index] != -1; index++) { - local->call_count++; - if (list[index] != priv->child_count) - file_list[1] = list[index]; - } - - if (local->call_count != 2) { - /* If the lookup was done for file */ - gf_log (this->name, GF_LOG_ERROR, - "%s: entry_count is %d", - loc->path, local->call_count); - for (index = 0; local->list[index] != -1; index++) - gf_log (this->name, GF_LOG_ERROR, "%s: found on %s", - loc->path, priv->xl_array[list[index]]->name); - - if (local->call_count < 2) { - gf_log (this->name, GF_LOG_ERROR, - "returning EIO as file found on onlyone node"); - STACK_UNWIND (frame, -1, EIO, fd); - return 0; - } - } - -#ifdef GF_DARWIN_HOST_OS - /* Handle symlink here */ - if (S_ISLNK (loc->inode->st_mode)) { - /* Callcount doesn't matter here */ - STACK_WIND (frame, - unify_open_readlink_cbk, - NS(this), - NS(this)->fops->readlink, - loc, ZR_PATH_MAX); - return 0; - } -#endif /* GF_DARWIN_HOST_OS */ - - local->call_count = 2; - for (index = 0; file_list[index] != -1; index++) { - char need_break = (file_list[index+1] == -1); - STACK_WIND_COOKIE (frame, - unify_open_cbk, - priv->xl_array[file_list[index]], //cookie - priv->xl_array[file_list[index]], - priv->xl_array[file_list[index]]->fops->open, - loc, - flags, - fd); - if (need_break) - break; - } - - return 0; -} - - -int32_t -unify_create_unlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - unify_local_t *local = frame->local; - inode_t *inode = local->loc1.inode; - - unify_local_wipe (local); - - STACK_UNWIND (frame, local->op_ret, local->op_errno, local->fd, - inode, &local->stbuf); - - return 0; -} - -/** - * unify_create_open_cbk - - */ -int32_t -unify_create_open_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - int ret = 0; - int32_t callcnt = 0; - unify_local_t *local = frame->local; - inode_t *inode = NULL; - xlator_t *child = NULL; - uint64_t tmp_value = 0; - - LOCK (&frame->lock); - { - if (op_ret >= 0) { - local->op_ret = op_ret; - if (NS(this) != (xlator_t *)cookie) { - /* Store child node's ptr, used in all - the f*** / FileIO calls */ - /* TODO: log on failure */ - ret = fd_ctx_get (fd, this, &tmp_value); - cookie = (void *)(long)tmp_value; - } else { - /* NOTE: open successful on namespace. - * fd's ctx can be used to identify open - * failure on storage subvolume. cool - * ide ;) */ - local->failed = 0; - } - } else { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - ((xlator_t *)cookie)->name, - local->loc1.path, strerror (op_errno)); - local->op_errno = op_errno; - local->failed = 1; - } - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->failed == 1 && (local->op_ret >= 0)) { - local->call_count = 1; - /* return -1 to user */ - local->op_ret = -1; - local->op_errno = EIO; - local->fd = fd; - local->call_count = 1; - - if (!fd_ctx_get (local->fd, this, &tmp_value)) { - child = (xlator_t *)(long)tmp_value; - - gf_log (this->name, GF_LOG_ERROR, - "Create success on child node, " - "failed on namespace"); - - STACK_WIND (frame, - unify_create_unlink_cbk, - child, - child->fops->unlink, - &local->loc1); - } else { - gf_log (this->name, GF_LOG_ERROR, - "Create success on namespace, " - "failed on child node"); - - STACK_WIND (frame, - unify_create_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - } - return 0; - } - inode = local->loc1.inode; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, fd, - inode, &local->stbuf); - } - return 0; -} - -/** - * unify_create_lookup_cbk - - */ -int32_t -unify_create_lookup_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct stat *buf, - dict_t *dict) -{ - int32_t callcnt = 0; - int16_t index = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - priv->xl_array[(long)cookie]->name, - local->loc1.path, strerror (op_errno)); - local->op_errno = op_errno; - local->failed = 1; - } - - if (op_ret >= 0) { - local->op_ret = op_ret; - local->list[local->index++] = (int16_t)(long)cookie; - if (NS(this) == priv->xl_array[(long)cookie]) { - local->st_ino = buf->st_ino; - } else { - local->stbuf = *buf; - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - int16_t *list = local->list; - int16_t file_list[3] = {0,}; - local->op_ret = -1; - - local->list [local->index] = -1; - file_list[0] = list[0]; - file_list[1] = list[1]; - file_list[2] = -1; - - local->stbuf.st_ino = local->st_ino; - /* TODO: log on failure */ - inode_ctx_put (local->loc1.inode, this, - (uint64_t)(long)local->list); - - if (local->index != 2) { - /* Lookup failed, can't do open */ - gf_log (this->name, GF_LOG_ERROR, - "%s: present on %d nodes", - local->loc1.path, local->index); - file_list[0] = priv->child_count; - for (index = 0; list[index] != -1; index++) { - gf_log (this->name, GF_LOG_ERROR, - "%s: found on %s", local->loc1.path, - priv->xl_array[list[index]]->name); - if (list[index] != priv->child_count) - file_list[1] = list[index]; - } - - if (local->index < 2) { - unify_local_wipe (local); - gf_log (this->name, GF_LOG_ERROR, - "returning EIO as file found on " - "only one node"); - STACK_UNWIND (frame, -1, EIO, - local->fd, inode, NULL); - return 0; - } - } - /* Everything is perfect :) */ - local->call_count = 2; - - for (index = 0; file_list[index] != -1; index++) { - char need_break = (file_list[index+1] == -1); - STACK_WIND_COOKIE (frame, - unify_create_open_cbk, - priv->xl_array[file_list[index]], - priv->xl_array[file_list[index]], - priv->xl_array[file_list[index]]->fops->open, - &local->loc1, - local->flags, - local->fd); - if (need_break) - break; - } - } - - return 0; -} - - -/** - * unify_create_cbk - - */ -int32_t -unify_create_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd, - inode_t *inode, - struct stat *buf) -{ - int ret = 0; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - inode_t *tmp_inode = NULL; - - if (op_ret == -1) { - /* send unlink () on Namespace */ - local->op_errno = op_errno; - local->op_ret = -1; - local->call_count = 1; - gf_log (this->name, GF_LOG_ERROR, - "create failed on %s (file %s, error %s), " - "sending unlink to namespace", - prev_frame->this->name, - local->loc1.path, strerror (op_errno)); - - STACK_WIND (frame, - unify_create_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - - return 0; - } - - if (op_ret >= 0) { - local->op_ret = op_ret; - local->stbuf = *buf; - /* Just inode number should be from NS node */ - local->stbuf.st_ino = local->st_ino; - - /* TODO: log on failure */ - ret = fd_ctx_set (fd, this, (uint64_t)(long)prev_frame->this); - } - - tmp_inode = local->loc1.inode; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, local->fd, - tmp_inode, &local->stbuf); - - return 0; -} - -/** - * unify_ns_create_cbk - - * - */ -int32_t -unify_ns_create_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd, - inode_t *inode, - struct stat *buf) -{ - struct sched_ops *sched_ops = NULL; - xlator_t *sched_xl = NULL; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int16_t *list = NULL; - int16_t index = 0; - - if (op_ret == -1) { - /* No need to send create request to other servers, as - namespace action failed. Handle exclusive create here. */ - if ((op_errno != EEXIST) || - ((op_errno == EEXIST) && - ((local->flags & O_EXCL) == O_EXCL))) { - /* If its just a create call without O_EXCL, - don't do this */ - gf_log (this->name, GF_LOG_ERROR, - "namespace: path(%s): %s", - local->loc1.path, strerror (op_errno)); - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf); - return 0; - } - } - - if (op_ret >= 0) { - /* Get the inode number from the NS node */ - local->st_ino = buf->st_ino; - - local->op_ret = -1; - - /* Start the mapping list */ - list = CALLOC (1, sizeof (int16_t) * 3); - ERR_ABORT (list); - inode_ctx_put (inode, this, (uint64_t)(long)list); - list[0] = priv->child_count; - list[2] = -1; - - /* This means, file doesn't exist anywhere in the Filesystem */ - sched_ops = priv->sched_ops; - - /* Send create request to the scheduled node now */ - sched_xl = sched_ops->schedule (this, local->loc1.path); - if (sched_xl == NULL) - { - /* send unlink () on Namespace */ - local->op_errno = ENOTCONN; - local->op_ret = -1; - local->call_count = 1; - gf_log (this->name, GF_LOG_ERROR, - "no node online to schedule create:(file %s) " - "sending unlink to namespace", - (local->loc1.path)?local->loc1.path:""); - - STACK_WIND (frame, - unify_create_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - - return 0; - } - - for (index = 0; index < priv->child_count; index++) - if (sched_xl == priv->xl_array[index]) - break; - list[1] = index; - - STACK_WIND (frame, unify_create_cbk, - sched_xl, sched_xl->fops->create, - &local->loc1, local->flags, local->mode, fd); - } else { - /* File already exists, and there is no O_EXCL flag */ - - gf_log (this->name, GF_LOG_DEBUG, - "File(%s) already exists on namespace, sending " - "open instead", local->loc1.path); - - local->list = CALLOC (1, sizeof (int16_t) * 3); - ERR_ABORT (local->list); - local->call_count = priv->child_count + 1; - local->op_ret = -1; - for (index = 0; index <= priv->child_count; index++) { - /* Send lookup() to all nodes including namespace */ - STACK_WIND_COOKIE (frame, - unify_create_lookup_cbk, - (void *)(long)index, - priv->xl_array[index], - priv->xl_array[index]->fops->lookup, - &local->loc1, - NULL); - } - } - return 0; -} - -/** - * unify_create - create a file in global namespace first, so other - * clients can see them. Create the file in storage nodes in background. - */ -int32_t -unify_create (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flags, - mode_t mode, - fd_t *fd) -{ - unify_local_t *local = NULL; - - /* Initialization */ - INIT_LOCAL (frame, local); - local->mode = mode; - local->flags = flags; - local->fd = fd; - - loc_copy (&local->loc1, loc); - if (local->loc1.path == NULL) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, fd, loc->inode, NULL); - return 0; - } - - STACK_WIND (frame, - unify_ns_create_cbk, - NS(this), - NS(this)->fops->create, - loc, - flags | O_EXCL, - mode, - fd); - - return 0; -} - - -/** - * unify_opendir_cbk - - */ -int32_t -unify_opendir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - STACK_UNWIND (frame, op_ret, op_errno, fd); - - return 0; -} - -/** - * unify_opendir - - */ -int32_t -unify_opendir (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - fd_t *fd) -{ - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - STACK_WIND (frame, unify_opendir_cbk, - NS(this), NS(this)->fops->opendir, loc, fd); - - return 0; -} - - -/** - * unify_chmod - - */ -int32_t -unify_chmod (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - mode_t mode) -{ - unify_local_t *local = NULL; - unify_private_t *priv = this->private; - int32_t index = 0; - int32_t callcnt = 0; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - - loc_copy (&local->loc1, loc); - local->st_ino = loc->inode->ino; - - if (S_ISDIR (loc->inode->st_mode)) { - local->call_count = priv->child_count + 1; - - for (index = 0; index < (priv->child_count + 1); index++) { - STACK_WIND (frame, - unify_buf_cbk, - priv->xl_array[index], - priv->xl_array[index]->fops->chmod, - loc, mode); - } - } else { - inode_ctx_get (loc->inode, this, &tmp_list); - local->list = (int16_t *)(long)tmp_list; - - for (index = 0; local->list[index] != -1; index++) { - local->call_count++; - callcnt++; - } - - for (index = 0; local->list[index] != -1; index++) { - STACK_WIND (frame, - unify_buf_cbk, - priv->xl_array[local->list[index]], - priv->xl_array[local->list[index]]->fops->chmod, - loc, - mode); - if (!--callcnt) - break; - } - } - - return 0; -} - -/** - * unify_chown - - */ -int32_t -unify_chown (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - uid_t uid, - gid_t gid) -{ - unify_local_t *local = NULL; - unify_private_t *priv = this->private; - int32_t index = 0; - int32_t callcnt = 0; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - local->st_ino = loc->inode->ino; - - if (S_ISDIR (loc->inode->st_mode)) { - local->call_count = priv->child_count + 1; - - for (index = 0; index < (priv->child_count + 1); index++) { - STACK_WIND (frame, - unify_buf_cbk, - priv->xl_array[index], - priv->xl_array[index]->fops->chown, - loc, uid, gid); - } - } else { - inode_ctx_get (loc->inode, this, &tmp_list); - local->list = (int16_t *)(long)tmp_list; - - for (index = 0; local->list[index] != -1; index++) { - local->call_count++; - callcnt++; - } - - for (index = 0; local->list[index] != -1; index++) { - STACK_WIND (frame, - unify_buf_cbk, - priv->xl_array[local->list[index]], - priv->xl_array[local->list[index]]->fops->chown, - loc, uid, gid); - if (!--callcnt) - break; - } - } - - return 0; -} - - -/** - * unify_truncate_cbk - - */ -int32_t -unify_truncate_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *buf) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - prev_frame->this->name, - (local->loc1.path)?local->loc1.path:"", - strerror (op_errno)); - local->op_errno = op_errno; - if (!((op_errno == ENOENT) && priv->optimist)) - local->op_ret = -1; - } - - if (op_ret >= 0) { - if (NS (this) == prev_frame->this) { - local->st_ino = buf->st_ino; - /* If the entry is directory, get the - stat from NS node */ - if (S_ISDIR (buf->st_mode) || - !local->stbuf.st_blksize) { - local->stbuf = *buf; - } - } - - if ((!S_ISDIR (buf->st_mode)) && - (NS (this) != prev_frame->this)) { - /* If file, take the stat info from - Storage node. */ - local->stbuf = *buf; - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->st_ino) - local->stbuf.st_ino = local->st_ino; - else - local->op_ret = -1; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->stbuf); - } - - return 0; -} - -/** - * unify_truncate - - */ -int32_t -unify_truncate (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - off_t offset) -{ - unify_local_t *local = NULL; - unify_private_t *priv = this->private; - int32_t index = 0; - int32_t callcnt = 0; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - local->st_ino = loc->inode->ino; - - if (S_ISDIR (loc->inode->st_mode)) { - local->call_count = 1; - - STACK_WIND (frame, - unify_buf_cbk, - NS(this), - NS(this)->fops->stat, - loc); - } else { - local->op_ret = 0; - inode_ctx_get (loc->inode, this, &tmp_list); - local->list = (int16_t *)(long)tmp_list; - - for (index = 0; local->list[index] != -1; index++) { - local->call_count++; - callcnt++; - } - - /* Don't send truncate to NS node */ - STACK_WIND (frame, unify_truncate_cbk, NS(this), - NS(this)->fops->stat, loc); - callcnt--; - - for (index = 0; local->list[index] != -1; index++) { - if (NS(this) != priv->xl_array[local->list[index]]) { - STACK_WIND (frame, - unify_truncate_cbk, - priv->xl_array[local->list[index]], - priv->xl_array[local->list[index]]->fops->truncate, - loc, - offset); - if (!--callcnt) - break; - } - } - } - - return 0; -} - -/** - * unify_utimens - - */ -int32_t -unify_utimens (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - struct timespec tv[2]) -{ - unify_local_t *local = NULL; - unify_private_t *priv = this->private; - int32_t index = 0; - int32_t callcnt = 0; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - local->st_ino = loc->inode->ino; - - if (S_ISDIR (loc->inode->st_mode)) { - local->call_count = priv->child_count + 1; - - for (index = 0; index < (priv->child_count + 1); index++) { - STACK_WIND (frame, - unify_buf_cbk, - priv->xl_array[index], - priv->xl_array[index]->fops->utimens, - loc, tv); - } - } else { - inode_ctx_get (loc->inode, this, &tmp_list); - local->list = (int16_t *)(long)tmp_list; - - for (index = 0; local->list[index] != -1; index++) { - local->call_count++; - callcnt++; - } - - for (index = 0; local->list[index] != -1; index++) { - STACK_WIND (frame, - unify_buf_cbk, - priv->xl_array[local->list[index]], - priv->xl_array[local->list[index]]->fops->utimens, - loc, - tv); - if (!--callcnt) - break; - } - } - - return 0; -} - -/** - * unify_readlink_cbk - - */ -int32_t -unify_readlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - const char *path) -{ - STACK_UNWIND (frame, op_ret, op_errno, path); - return 0; -} - -/** - * unify_readlink - Read the link only from the storage node. - */ -int32_t -unify_readlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - size_t size) -{ - unify_private_t *priv = this->private; - int32_t entry_count = 0; - int16_t *list = NULL; - int16_t index = 0; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - for (index = 0; list[index] != -1; index++) - entry_count++; - - if (entry_count >= 2) { - for (index = 0; list[index] != -1; index++) { - if (priv->xl_array[list[index]] != NS(this)) { - STACK_WIND (frame, - unify_readlink_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->readlink, - loc, - size); - break; - } - } - } else { - gf_log (this->name, GF_LOG_ERROR, - "returning ENOENT, no softlink files found " - "on storage node"); - STACK_UNWIND (frame, -1, ENOENT, NULL); - } - - return 0; -} - - -/** - * unify_unlink_cbk - - */ -int32_t -unify_unlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret == 0 || ((op_errno == ENOENT) && priv->optimist)) - local->op_ret = 0; - if (op_ret == -1) - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno); - } - - return 0; -} - - -/** - * unify_unlink - - */ -int32_t -unify_unlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - unify_private_t *priv = this->private; - unify_local_t *local = NULL; - int16_t *list = NULL; - int16_t index = 0; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - for (index = 0; list[index] != -1; index++) - local->call_count++; - - if (local->call_count) { - for (index = 0; list[index] != -1; index++) { - char need_break = (list[index+1] == -1); - STACK_WIND (frame, - unify_unlink_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->unlink, - loc); - if (need_break) - break; - } - } else { - gf_log (this->name, GF_LOG_ERROR, - "%s: returning ENOENT", loc->path); - STACK_UNWIND (frame, -1, ENOENT); - } - - return 0; -} - - -/** - * unify_readv_cbk - - */ -int32_t -unify_readv_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iovec *vector, - int32_t count, - struct stat *stbuf, - struct iobref *iobref) -{ - STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf, iobref); - return 0; -} - -/** - * unify_readv - - */ -int32_t -unify_readv (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t offset) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, - unify_readv_cbk, - child, - child->fops->readv, - fd, - size, - offset); - - - return 0; -} - -/** - * unify_writev_cbk - - */ -int32_t -unify_writev_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *stbuf) -{ - STACK_UNWIND (frame, op_ret, op_errno, stbuf); - return 0; -} - -/** - * unify_writev - - */ -int32_t -unify_writev (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - struct iovec *vector, - int32_t count, - off_t off, - struct iobref *iobref) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, - unify_writev_cbk, - child, - child->fops->writev, - fd, - vector, - count, - off, - iobref); - - return 0; -} - -/** - * unify_ftruncate - - */ -int32_t -unify_ftruncate (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - off_t offset) -{ - xlator_t *child = NULL; - unify_local_t *local = NULL; - uint64_t tmp_child = 0; - - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR(fd); - - /* Initialization */ - INIT_LOCAL (frame, local); - local->op_ret = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - local->call_count = 2; - - STACK_WIND (frame, unify_truncate_cbk, - child, child->fops->ftruncate, - fd, offset); - - STACK_WIND (frame, unify_truncate_cbk, - NS(this), NS(this)->fops->fstat, - fd); - - return 0; -} - - -/** - * unify_fchmod - - */ -int32_t -unify_fchmod (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - mode_t mode) -{ - unify_local_t *local = NULL; - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(fd); - - /* Initialization */ - INIT_LOCAL (frame, local); - local->st_ino = fd->inode->ino; - - if (!fd_ctx_get (fd, this, &tmp_child)) { - /* If its set, then its file */ - child = (xlator_t *)(long)tmp_child; - - local->call_count = 2; - - STACK_WIND (frame, unify_buf_cbk, child, - child->fops->fchmod, fd, mode); - - STACK_WIND (frame, unify_buf_cbk, NS(this), - NS(this)->fops->fchmod, fd, mode); - - } else { - /* this is an directory */ - local->call_count = 1; - - STACK_WIND (frame, unify_buf_cbk, - NS(this), NS(this)->fops->fchmod, fd, mode); - } - - return 0; -} - -/** - * unify_fchown - - */ -int32_t -unify_fchown (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - uid_t uid, - gid_t gid) -{ - unify_local_t *local = NULL; - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(fd); - - /* Initialization */ - INIT_LOCAL (frame, local); - local->st_ino = fd->inode->ino; - - if (!fd_ctx_get (fd, this, &tmp_child)) { - /* If its set, then its file */ - child = (xlator_t *)(long)tmp_child; - - local->call_count = 2; - - STACK_WIND (frame, unify_buf_cbk, child, - child->fops->fchown, fd, uid, gid); - - STACK_WIND (frame, unify_buf_cbk, NS(this), - NS(this)->fops->fchown, fd, uid, gid); - } else { - local->call_count = 1; - - STACK_WIND (frame, unify_buf_cbk, - NS(this), NS(this)->fops->fchown, - fd, uid, gid); - } - - return 0; -} - -/** - * unify_flush_cbk - - */ -int32_t -unify_flush_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -/** - * unify_flush - - */ -int32_t -unify_flush (call_frame_t *frame, - xlator_t *this, - fd_t *fd) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, unify_flush_cbk, child, - child->fops->flush, fd); - - return 0; -} - - -/** - * unify_fsync_cbk - - */ -int32_t -unify_fsync_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -/** - * unify_fsync - - */ -int32_t -unify_fsync (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t flags) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, unify_fsync_cbk, child, - child->fops->fsync, fd, flags); - - return 0; -} - -/** - * unify_fstat - Send fstat FOP to Namespace only if its directory, and to - * both namespace and the storage node if its a file. - */ -int32_t -unify_fstat (call_frame_t *frame, - xlator_t *this, - fd_t *fd) -{ - unify_local_t *local = NULL; - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(fd); - - INIT_LOCAL (frame, local); - local->st_ino = fd->inode->ino; - - if (!fd_ctx_get (fd, this, &tmp_child)) { - /* If its set, then its file */ - child = (xlator_t *)(long)tmp_child; - local->call_count = 2; - - STACK_WIND (frame, unify_buf_cbk, child, - child->fops->fstat, fd); - - STACK_WIND (frame, unify_buf_cbk, NS(this), - NS(this)->fops->fstat, fd); - - } else { - /* this is an directory */ - local->call_count = 1; - STACK_WIND (frame, unify_buf_cbk, NS(this), - NS(this)->fops->fstat, fd); - } - - return 0; -} - -/** - * unify_getdents_cbk - - */ -int32_t -unify_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count) -{ - STACK_UNWIND (frame, op_ret, op_errno, entry, count); - return 0; -} - -/** - * unify_getdents - send the FOP request to all the nodes. - */ -int32_t -unify_getdents (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t offset, - int32_t flag) -{ - UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd); - - STACK_WIND (frame, unify_getdents_cbk, NS(this), - NS(this)->fops->getdents, fd, size, offset, flag); - - return 0; -} - - -/** - * unify_readdir_cbk - - */ -int32_t -unify_readdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - gf_dirent_t *buf) -{ - STACK_UNWIND (frame, op_ret, op_errno, buf); - - return 0; -} - -/** - * unify_readdir - send the FOP request to all the nodes. - */ -int32_t -unify_readdir (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t offset) -{ - UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd); - - STACK_WIND (frame, unify_readdir_cbk, NS(this), - NS(this)->fops->readdir, fd, size, offset); - - return 0; -} - - -/** - * unify_fsyncdir_cbk - - */ -int32_t -unify_fsyncdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - - return 0; -} - -/** - * unify_fsyncdir - - */ -int32_t -unify_fsyncdir (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t flags) -{ - UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd); - - STACK_WIND (frame, unify_fsyncdir_cbk, - NS(this), NS(this)->fops->fsyncdir, fd, flags); - - return 0; -} - -/** - * unify_lk_cbk - UNWIND frame with the proper return arguments. - */ -int32_t -unify_lk_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct flock *lock) -{ - STACK_UNWIND (frame, op_ret, op_errno, lock); - return 0; -} - -/** - * unify_lk - Send it to all the storage nodes, (should be 1) which has file. - */ -int32_t -unify_lk (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t cmd, - struct flock *lock) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, unify_lk_cbk, child, - child->fops->lk, fd, cmd, lock); - - return 0; -} - - -int32_t -unify_setxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno); - -static int32_t -unify_setxattr_file_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - unify_private_t *private = this->private; - unify_local_t *local = frame->local; - xlator_t *sched_xl = NULL; - struct sched_ops *sched_ops = NULL; - - if (op_ret == -1) { - if (!ENOTSUP) - gf_log (this->name, GF_LOG_ERROR, - "setxattr with XATTR_CREATE on ns: " - "path(%s) key(%s): %s", - local->loc1.path, local->name, - strerror (op_errno)); - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno); - return 0; - } - - LOCK (&frame->lock); - { - local->failed = 0; - local->op_ret = 0; - local->op_errno = 0; - local->call_count = 1; - } - UNLOCK (&frame->lock); - - /* schedule XATTR_CREATE on one of the child node */ - sched_ops = private->sched_ops; - - /* Send create request to the scheduled node now */ - sched_xl = sched_ops->schedule (this, local->name); - if (!sched_xl) { - STACK_UNWIND (frame, -1, ENOTCONN); - return 0; - } - - STACK_WIND (frame, - unify_setxattr_cbk, - sched_xl, - sched_xl->fops->setxattr, - &local->loc1, - local->dict, - local->flags); - return 0; -} - -/** - * unify_setxattr_cbk - When all the child nodes return, UNWIND frame. - */ -int32_t -unify_setxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = 0; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - dict_t *dict = NULL; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, (((op_errno == ENOENT) || - (op_errno == ENOTSUP))? - GF_LOG_DEBUG : GF_LOG_ERROR), - "child(%s): path(%s): %s", - prev_frame->this->name, - (local->loc1.path)?local->loc1.path:"", - strerror (op_errno)); - if (local->failed == -1) { - local->failed = 1; - } - local->op_errno = op_errno; - } else { - local->failed = 0; - local->op_ret = op_ret; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->failed && local->name && - ZR_FILE_CONTENT_REQUEST(local->name)) { - dict = get_new_dict (); - dict_set (dict, local->dict->members_list->key, - data_from_dynptr(NULL, 0)); - dict_ref (dict); - - local->call_count = 1; - - STACK_WIND (frame, - unify_setxattr_file_cbk, - NS(this), - NS(this)->fops->setxattr, - &local->loc1, - dict, - XATTR_CREATE); - - dict_unref (dict); - return 0; - } - - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno); - } - - return 0; -} - -/** - * unify_sexattr - This function should be sent to all the storage nodes, - * which contains the file, (excluding namespace). - */ -int32_t -unify_setxattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *dict, - int32_t flags) -{ - unify_private_t *priv = this->private; - unify_local_t *local = NULL; - int16_t *list = NULL; - int16_t index = 0; - int32_t call_count = 0; - uint64_t tmp_list = 0; - data_pair_t *trav = dict->members_list; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - local->failed = -1; - loc_copy (&local->loc1, loc); - - if (S_ISDIR (loc->inode->st_mode)) { - - if (trav && trav->key && ZR_FILE_CONTENT_REQUEST(trav->key)) { - /* direct the storage xlators to change file - content only if file exists */ - local->flags = flags; - local->dict = dict; - local->name = strdup (trav->key); - flags |= XATTR_REPLACE; - } - - local->call_count = priv->child_count; - for (index = 0; index < priv->child_count; index++) { - STACK_WIND (frame, - unify_setxattr_cbk, - priv->xl_array[index], - priv->xl_array[index]->fops->setxattr, - loc, dict, flags); - } - return 0; - } - - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - for (index = 0; list[index] != -1; index++) { - if (NS(this) != priv->xl_array[list[index]]) { - local->call_count++; - call_count++; - } - } - - if (local->call_count) { - for (index = 0; list[index] != -1; index++) { - if (priv->xl_array[list[index]] != NS(this)) { - STACK_WIND (frame, - unify_setxattr_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->setxattr, - loc, - dict, - flags); - if (!--call_count) - break; - } - } - return 0; - } - - /* No entry in storage nodes */ - gf_log (this->name, GF_LOG_DEBUG, - "returning ENOENT, file not found on storage node."); - STACK_UNWIND (frame, -1, ENOENT); - - return 0; -} - - -/** - * unify_getxattr_cbk - This function is called from only one child, so, no - * need of any lock or anything else, just send it to above layer - */ -int32_t -unify_getxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dict_t *value) -{ - int32_t callcnt = 0; - dict_t *local_value = NULL; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, - (((op_errno == ENOENT) || - (op_errno == ENODATA) || - (op_errno == ENOTSUP)) ? - GF_LOG_DEBUG : GF_LOG_ERROR), - "child(%s): path(%s): %s", - prev_frame->this->name, - (local->loc1.path)?local->loc1.path:"", - strerror (op_errno)); - } else { - if (!local->dict) - local->dict = dict_ref (value); - local->op_ret = op_ret; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local_value = local->dict; - local->dict = NULL; - - STACK_UNWIND (frame, local->op_ret, local->op_errno, - local_value); - - if (local_value) - dict_unref (local_value); - } - - return 0; -} - - -/** - * unify_getxattr - This FOP is sent to only the storage node. - */ -int32_t -unify_getxattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - const char *name) -{ - unify_private_t *priv = this->private; - int16_t *list = NULL; - int16_t index = 0; - int16_t count = 0; - unify_local_t *local = NULL; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - INIT_LOCAL (frame, local); - - if (S_ISDIR (loc->inode->st_mode)) { - local->call_count = priv->child_count; - for (index = 0; index < priv->child_count; index++) - STACK_WIND (frame, - unify_getxattr_cbk, - priv->xl_array[index], - priv->xl_array[index]->fops->getxattr, - loc, - name); - return 0; - } - - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - for (index = 0; list[index] != -1; index++) { - if (NS(this) != priv->xl_array[list[index]]) { - local->call_count++; - count++; - } - } - - if (count) { - for (index = 0; list[index] != -1; index++) { - if (priv->xl_array[list[index]] != NS(this)) { - STACK_WIND (frame, - unify_getxattr_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->getxattr, - loc, - name); - if (!--count) - break; - } - } - } else { - dict_t *tmp_dict = get_new_dict (); - gf_log (this->name, GF_LOG_DEBUG, - "%s: returning ENODATA, no file found on storage node", - loc->path); - STACK_UNWIND (frame, -1, ENODATA, tmp_dict); - dict_destroy (tmp_dict); - } - - return 0; -} - -/** - * unify_removexattr_cbk - Wait till all the child node returns the call - * and then UNWIND to above layer. - */ -int32_t -unify_removexattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = 0; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret == -1) { - local->op_errno = op_errno; - if (op_errno != ENOTSUP) - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - prev_frame->this->name, - local->loc1.path, strerror (op_errno)); - } else { - local->op_ret = op_ret; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - STACK_UNWIND (frame, local->op_ret, local->op_errno); - } - - return 0; -} - -/** - * unify_removexattr - Send it to all the child nodes which has the files. - */ -int32_t -unify_removexattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - const char *name) -{ - unify_private_t *priv = this->private; - unify_local_t *local = NULL; - int16_t *list = NULL; - int16_t index = 0; - int32_t call_count = 0; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - - if (S_ISDIR (loc->inode->st_mode)) { - local->call_count = priv->child_count; - for (index = 0; index < priv->child_count; index++) - STACK_WIND (frame, - unify_removexattr_cbk, - priv->xl_array[index], - priv->xl_array[index]->fops->removexattr, - loc, - name); - - return 0; - } - - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - for (index = 0; list[index] != -1; index++) { - if (NS(this) != priv->xl_array[list[index]]) { - local->call_count++; - call_count++; - } - } - - if (local->call_count) { - for (index = 0; list[index] != -1; index++) { - if (priv->xl_array[list[index]] != NS(this)) { - STACK_WIND (frame, - unify_removexattr_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->removexattr, - loc, - name); - if (!--call_count) - break; - } - } - return 0; - } - - gf_log (this->name, GF_LOG_DEBUG, - "%s: returning ENOENT, not found on storage node.", loc->path); - STACK_UNWIND (frame, -1, ENOENT); - - return 0; -} - - -int32_t -unify_mknod_unlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - unify_local_t *local = frame->local; - - if (op_ret == -1) - gf_log (this->name, GF_LOG_ERROR, - "%s: %s", local->loc1.path, strerror (op_errno)); - - unify_local_wipe (local); - /* No log required here as this -1 is for mknod call */ - STACK_UNWIND (frame, -1, local->op_errno, NULL, NULL); - return 0; -} - -/** - * unify_mknod_cbk - - */ -int32_t -unify_mknod_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct stat *buf) -{ - unify_local_t *local = frame->local; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "mknod failed on storage node, sending unlink to " - "namespace"); - local->op_errno = op_errno; - STACK_WIND (frame, - unify_mknod_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - return 0; - } - - local->stbuf = *buf; - local->stbuf.st_ino = local->st_ino; - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf); - return 0; -} - -/** - * unify_ns_mknod_cbk - - */ -int32_t -unify_ns_mknod_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct stat *buf) -{ - struct sched_ops *sched_ops = NULL; - xlator_t *sched_xl = NULL; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int16_t *list = NULL; - int16_t index = 0; - call_frame_t *prev_frame = cookie; - - if (op_ret == -1) { - /* No need to send mknod request to other servers, - * as namespace action failed - */ - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - prev_frame->this->name, local->loc1.path, - strerror (op_errno)); - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, inode, buf); - return 0; - } - - /* Create one inode for this entry */ - local->op_ret = 0; - local->stbuf = *buf; - local->st_ino = buf->st_ino; - - list = CALLOC (1, sizeof (int16_t) * 3); - ERR_ABORT (list); - list[0] = priv->child_count; - list[2] = -1; - inode_ctx_put (inode, this, (uint64_t)(long)list); - - sched_ops = priv->sched_ops; - - /* Send mknod request to scheduled node now */ - sched_xl = sched_ops->schedule (this, local->loc1.path); - if (!sched_xl) { - gf_log (this->name, GF_LOG_ERROR, - "mknod failed on storage node, no node online " - "at the moment, sending unlink to NS"); - local->op_errno = ENOTCONN; - STACK_WIND (frame, - unify_mknod_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - - return 0; - } - - for (index = 0; index < priv->child_count; index++) - if (sched_xl == priv->xl_array[index]) - break; - list[1] = index; - - STACK_WIND (frame, unify_mknod_cbk, - sched_xl, sched_xl->fops->mknod, - &local->loc1, local->mode, local->dev); - - return 0; -} - -/** - * unify_mknod - Create a device on namespace first, and later create on - * the storage node. - */ -int32_t -unify_mknod (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - mode_t mode, - dev_t rdev) -{ - unify_local_t *local = NULL; - - /* Initialization */ - INIT_LOCAL (frame, local); - local->mode = mode; - local->dev = rdev; - loc_copy (&local->loc1, loc); - if (local->loc1.path == NULL) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL); - return 0; - } - - STACK_WIND (frame, - unify_ns_mknod_cbk, - NS(this), - NS(this)->fops->mknod, - loc, - mode, - rdev); - - return 0; -} - -int32_t -unify_symlink_unlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - unify_local_t *local = frame->local; - if (op_ret == -1) - gf_log (this->name, GF_LOG_ERROR, - "%s: %s", local->loc1.path, strerror (op_errno)); - - unify_local_wipe (local); - STACK_UNWIND (frame, -1, local->op_errno, NULL, NULL); - return 0; -} - -/** - * unify_symlink_cbk - - */ -int32_t -unify_symlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct stat *buf) -{ - unify_local_t *local = frame->local; - - if (op_ret == -1) { - /* Symlink on storage node failed, hence send unlink - to the NS node */ - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_ERROR, - "symlink on storage node failed, sending unlink " - "to namespace"); - - STACK_WIND (frame, - unify_symlink_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - - return 0; - } - - local->stbuf = *buf; - local->stbuf.st_ino = local->st_ino; - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf); - - return 0; -} - -/** - * unify_ns_symlink_cbk - - */ -int32_t -unify_ns_symlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct stat *buf) -{ - - struct sched_ops *sched_ops = NULL; - xlator_t *sched_xl = NULL; - int16_t *list = NULL; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int16_t index = 0; - - if (op_ret == -1) { - /* No need to send symlink request to other servers, - * as namespace action failed - */ - gf_log (this->name, GF_LOG_ERROR, - "namespace: path(%s): %s", - local->loc1.path, strerror (op_errno)); - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, NULL, buf); - return 0; - } - - /* Create one inode for this entry */ - local->op_ret = 0; - local->st_ino = buf->st_ino; - - /* Start the mapping list */ - - list = CALLOC (1, sizeof (int16_t) * 3); - ERR_ABORT (list); - list[0] = priv->child_count; //namespace's index - list[2] = -1; - inode_ctx_put (inode, this, (uint64_t)(long)list); - - sched_ops = priv->sched_ops; - - /* Send symlink request to all the nodes now */ - sched_xl = sched_ops->schedule (this, local->loc1.path); - if (!sched_xl) { - /* Symlink on storage node failed, hence send unlink - to the NS node */ - local->op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_ERROR, - "symlink on storage node failed, no node online, " - "sending unlink to namespace"); - - STACK_WIND (frame, - unify_symlink_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - - return 0; - } - - for (index = 0; index < priv->child_count; index++) - if (sched_xl == priv->xl_array[index]) - break; - list[1] = index; - - STACK_WIND (frame, - unify_symlink_cbk, - sched_xl, - sched_xl->fops->symlink, - local->name, - &local->loc1); - - return 0; -} - -/** - * unify_symlink - - */ -int32_t -unify_symlink (call_frame_t *frame, - xlator_t *this, - const char *linkpath, - loc_t *loc) -{ - unify_local_t *local = NULL; - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - local->name = strdup (linkpath); - - if ((local->name == NULL) || - (local->loc1.path == NULL)) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL); - return 0; - } - - STACK_WIND (frame, - unify_ns_symlink_cbk, - NS(this), - NS(this)->fops->symlink, - linkpath, - loc); - - return 0; -} - - -int32_t -unify_rename_unlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = 0; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s -> %s): %s", - prev_frame->this->name, - local->loc1.path, local->loc2.path, - strerror (op_errno)); - - } - LOCK (&frame->lock); - { - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local->stbuf.st_ino = local->st_ino; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->stbuf); - } - return 0; -} - -int32_t -unify_ns_rename_undo_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *buf) -{ - unify_local_t *local = frame->local; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "namespace: path(%s -> %s): %s", - local->loc1.path, local->loc2.path, - strerror (op_errno)); - } - - local->stbuf.st_ino = local->st_ino; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, &local->stbuf); - return 0; -} - -int32_t -unify_rename_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *buf) -{ - int32_t index = 0; - int32_t callcnt = 0; - int16_t *list = NULL; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret >= 0) { - if (!S_ISDIR (buf->st_mode)) - local->stbuf = *buf; - local->op_ret = op_ret; - } else { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s -> %s): %s", - prev_frame->this->name, - local->loc1.path, local->loc2.path, - strerror (op_errno)); - local->op_errno = op_errno; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local->stbuf.st_ino = local->st_ino; - if (S_ISDIR (local->loc1.inode->st_mode)) { - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, &local->stbuf); - return 0; - } - - if (local->op_ret == -1) { - /* TODO: check this logic */ - - /* Rename failed in storage node, successful on NS, - * hence, rename back the entries in NS */ - /* NOTE: this will be done only if the destination - * doesn't exists, if the destination exists, the - * job of correcting NS is left to self-heal - */ - if (!local->index) { - loc_t tmp_oldloc = { - /* its actual 'newloc->path' */ - .path = local->loc2.path, - .inode = local->loc1.inode, - .parent = local->loc2.parent - }; - - loc_t tmp_newloc = { - /* Actual 'oldloc->path' */ - .path = local->loc1.path, - .parent = local->loc1.parent - }; - - gf_log (this->name, GF_LOG_ERROR, - "rename succussful on namespace, on " - "stroage node failed, reverting back"); - - STACK_WIND (frame, - unify_ns_rename_undo_cbk, - NS(this), - NS(this)->fops->rename, - &tmp_oldloc, - &tmp_newloc); - return 0; - } - } else { - /* Rename successful on storage nodes */ - - int32_t idx = 0; - int16_t *tmp_list = NULL; - uint64_t tmp_list_int64 = 0; - if (local->loc2.inode) { - inode_ctx_get (local->loc2.inode, - this, &tmp_list_int64); - list = (int16_t *)(long)tmp_list_int64; - - } - - if (list) { - for (index = 0; list[index] != -1; index++); - tmp_list = CALLOC (1, index * 2); - memcpy (tmp_list, list, index * 2); - - for (index = 0; list[index] != -1; index++) { - /* TODO: Check this logic. */ - /* If the destination file exists in - * the same storage node where we sent - * 'rename' call, no need to send - * unlink - */ - for (idx = 0; - local->list[idx] != -1; idx++) { - if (tmp_list[index] == local->list[idx]) { - tmp_list[index] = priv->child_count; - continue; - } - } - - if (NS(this) != priv->xl_array[tmp_list[index]]) { - local->call_count++; - callcnt++; - } - } - - if (local->call_count) { - if (callcnt > 1) - gf_log (this->name, - GF_LOG_ERROR, - "%s->%s: more (%d) " - "subvolumes have the " - "newloc entry", - local->loc1.path, - local->loc2.path, - callcnt); - - for (index=0; - tmp_list[index] != -1; index++) { - if (NS(this) != priv->xl_array[tmp_list[index]]) { - STACK_WIND (frame, - unify_rename_unlink_cbk, - priv->xl_array[tmp_list[index]], - priv->xl_array[tmp_list[index]]->fops->unlink, - &local->loc2); - if (!--callcnt) - break; - } - } - - FREE (tmp_list); - return 0; - } - if (tmp_list) - FREE (tmp_list); - } - } - - /* Need not send 'unlink' to storage node */ - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, - local->op_errno, &local->stbuf); - } - - return 0; -} - -int32_t -unify_ns_rename_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *buf) -{ - int32_t index = 0; - int32_t callcnt = 0; - int16_t *list = NULL; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - if (op_ret == -1) { - /* Free local->new_inode */ - gf_log (this->name, GF_LOG_ERROR, - "namespace: path(%s -> %s): %s", - local->loc1.path, local->loc2.path, - strerror (op_errno)); - - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; - } - - local->stbuf = *buf; - local->st_ino = buf->st_ino; - - /* Everything is fine. */ - if (S_ISDIR (buf->st_mode)) { - local->call_count = priv->child_count; - for (index=0; index < priv->child_count; index++) { - STACK_WIND (frame, - unify_rename_cbk, - priv->xl_array[index], - priv->xl_array[index]->fops->rename, - &local->loc1, - &local->loc2); - } - - return 0; - } - - local->call_count = 0; - /* send rename */ - list = local->list; - for (index=0; list[index] != -1; index++) { - if (NS(this) != priv->xl_array[list[index]]) { - local->call_count++; - callcnt++; - } - } - - if (local->call_count) { - for (index=0; list[index] != -1; index++) { - if (NS(this) != priv->xl_array[list[index]]) { - STACK_WIND (frame, - unify_rename_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->rename, - &local->loc1, - &local->loc2); - if (!--callcnt) - break; - } - } - } else { - /* file doesn't seem to be present in storage nodes */ - gf_log (this->name, GF_LOG_CRITICAL, - "CRITICAL: source file not in storage node, " - "rename successful on namespace :O"); - unify_local_wipe (local); - STACK_UNWIND (frame, -1, EIO, NULL); - } - return 0; -} - - -/** - * unify_rename - One of the tricky function. The deadliest of all :O - */ -int32_t -unify_rename (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc) -{ - unify_local_t *local = NULL; - uint64_t tmp_list = 0; - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, oldloc); - loc_copy (&local->loc2, newloc); - - if ((local->loc1.path == NULL) || - (local->loc2.path == NULL)) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; - } - - inode_ctx_get (oldloc->inode, this, &tmp_list); - local->list = (int16_t *)(long)tmp_list; - - STACK_WIND (frame, - unify_ns_rename_cbk, - NS(this), - NS(this)->fops->rename, - oldloc, - newloc); - return 0; -} - -/** - * unify_link_cbk - - */ -int32_t -unify_link_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct stat *buf) -{ - unify_local_t *local = frame->local; - - if (op_ret >= 0) - local->stbuf = *buf; - local->stbuf.st_ino = local->st_ino; - - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf); - - return 0; -} - -/** - * unify_ns_link_cbk - - */ -int32_t -unify_ns_link_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct stat *buf) -{ - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - int16_t *list = local->list; - int16_t index = 0; - - if (op_ret == -1) { - /* No need to send link request to other servers, - * as namespace action failed - */ - gf_log (this->name, GF_LOG_ERROR, - "namespace: path(%s -> %s): %s", - local->loc1.path, local->loc2.path, - strerror (op_errno)); - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, inode, buf); - return 0; - } - - /* Update inode for this entry */ - local->op_ret = 0; - local->st_ino = buf->st_ino; - - /* Send link request to the node now */ - for (index = 0; list[index] != -1; index++) { - char need_break = (list[index+1] == -1); - if (priv->xl_array[list[index]] != NS (this)) { - STACK_WIND (frame, - unify_link_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->link, - &local->loc1, - &local->loc2); - break; - } - if (need_break) - break; - } - - return 0; -} - -/** - * unify_link - - */ -int32_t -unify_link (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc) -{ - unify_local_t *local = NULL; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (oldloc); - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (newloc); - - /* Initialization */ - INIT_LOCAL (frame, local); - - loc_copy (&local->loc1, oldloc); - loc_copy (&local->loc2, newloc); - - inode_ctx_get (oldloc->inode, this, &tmp_list); - local->list = (int16_t *)(long)tmp_list; - - STACK_WIND (frame, - unify_ns_link_cbk, - NS(this), - NS(this)->fops->link, - oldloc, - newloc); - - return 0; -} - - -/** - * unify_checksum_cbk - - */ -int32_t -unify_checksum_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - uint8_t *fchecksum, - uint8_t *dchecksum) -{ - STACK_UNWIND (frame, op_ret, op_errno, fchecksum, dchecksum); - - return 0; -} - -/** - * unify_checksum - - */ -int32_t -unify_checksum (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flag) -{ - STACK_WIND (frame, - unify_checksum_cbk, - NS(this), - NS(this)->fops->checksum, - loc, - flag); - - return 0; -} - - -/** - * unify_finodelk_cbk - - */ -int -unify_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -/** - * unify_finodelk - */ -int -unify_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int cmd, struct flock *flock) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, unify_finodelk_cbk, - child, child->fops->finodelk, - volume, fd, cmd, flock); - - return 0; -} - - - -/** - * unify_fentrylk_cbk - - */ -int -unify_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -/** - * unify_fentrylk - */ -int -unify_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type) - -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, unify_fentrylk_cbk, - child, child->fops->fentrylk, - volume, fd, basename, cmd, type); - - return 0; -} - - - -/** - * unify_fxattrop_cbk - - */ -int -unify_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) -{ - STACK_UNWIND (frame, op_ret, op_errno, xattr); - return 0; -} - -/** - * unify_fxattrop - */ -int -unify_fxattrop (call_frame_t *frame, xlator_t *this, - fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, unify_fxattrop_cbk, - child, child->fops->fxattrop, - fd, optype, xattr); - - return 0; -} - - -/** - * unify_inodelk_cbk - - */ -int -unify_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - - -/** - * unify_inodelk - */ -int -unify_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int cmd, struct flock *flock) -{ - xlator_t *child = NULL; - - child = unify_loc_subvol (loc, this); - - STACK_WIND (frame, unify_inodelk_cbk, - child, child->fops->inodelk, - volume, loc, cmd, flock); - - return 0; -} - - - -/** - * unify_entrylk_cbk - - */ -int -unify_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -/** - * unify_entrylk - */ -int -unify_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type) - -{ - xlator_t *child = NULL; - - child = unify_loc_subvol (loc, this); - - STACK_WIND (frame, unify_entrylk_cbk, - child, child->fops->entrylk, - volume, loc, basename, cmd, type); - - return 0; -} - - - -/** - * unify_xattrop_cbk - - */ -int -unify_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) -{ - STACK_UNWIND (frame, op_ret, op_errno, xattr); - return 0; -} - -/** - * unify_xattrop - */ -int -unify_xattrop (call_frame_t *frame, xlator_t *this, - loc_t *loc, gf_xattrop_flags_t optype, dict_t *xattr) -{ - xlator_t *child = NULL; - - child = unify_loc_subvol (loc, this); - - STACK_WIND (frame, unify_xattrop_cbk, - child, child->fops->xattrop, - loc, optype, xattr); - - return 0; -} - -int -unify_forget (xlator_t *this, - inode_t *inode) -{ - int16_t *list = NULL; - uint64_t tmp_list = 0; - - if (inode->st_mode && (!S_ISDIR(inode->st_mode))) { - inode_ctx_get (inode, this, &tmp_list); - if (tmp_list) { - list = (int16_t *)(long)tmp_list; - FREE (list); - } - } - - return 0; -} - -/** - * notify - */ -int32_t -notify (xlator_t *this, - int32_t event, - void *data, - ...) -{ - unify_private_t *priv = this->private; - struct sched_ops *sched = NULL; - - if (!priv) { - return 0; - } - - sched = priv->sched_ops; - if (!sched) { - gf_log (this->name, GF_LOG_CRITICAL, "No scheduler :O"); - raise (SIGTERM); - return 0; - } - if (priv->namespace == data) { - if (event == GF_EVENT_CHILD_UP) { - sched->notify (this, event, data); - } - return 0; - } - - switch (event) - { - case GF_EVENT_CHILD_UP: - { - /* Call scheduler's update () to enable it for scheduling */ - sched->notify (this, event, data); - - LOCK (&priv->lock); - { - /* Increment the inode's generation, which is - used for self_heal */ - ++priv->inode_generation; - ++priv->num_child_up; - } - UNLOCK (&priv->lock); - - if (!priv->is_up) { - default_notify (this, event, data); - priv->is_up = 1; - } - } - break; - case GF_EVENT_CHILD_DOWN: - { - /* Call scheduler's update () to disable the child node - * for scheduling - */ - sched->notify (this, event, data); - LOCK (&priv->lock); - { - --priv->num_child_up; - } - UNLOCK (&priv->lock); - - if (priv->num_child_up == 0) { - /* Send CHILD_DOWN to upper layer */ - default_notify (this, event, data); - priv->is_up = 0; - } - } - break; - - default: - { - default_notify (this, event, data); - } - break; - } - - return 0; -} - -/** - * init - This function is called first in the xlator, while initializing. - * All the config file options are checked and appropriate flags are set. - * - * @this - - */ -int32_t -init (xlator_t *this) -{ - int32_t ret = 0; - int32_t count = 0; - data_t *scheduler = NULL; - data_t *data = NULL; - xlator_t *ns_xl = NULL; - xlator_list_t *trav = NULL; - xlator_list_t *xlparent = NULL; - xlator_list_t *parent = NULL; - unify_private_t *_private = NULL; - - /* Check for number of child nodes, if there is no child nodes, exit */ - if (!this->children) { - gf_log (this->name, GF_LOG_ERROR, - "No child nodes specified. check \"subvolumes \" " - "option in volfile"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); - } - - /* Check for 'scheduler' in volume */ - scheduler = dict_get (this->options, "scheduler"); - if (!scheduler) { - gf_log (this->name, GF_LOG_ERROR, - "\"option scheduler <x>\" is missing in volfile"); - return -1; - } - - /* Setting "option namespace <node>" */ - data = dict_get (this->options, "namespace"); - if(!data) { - gf_log (this->name, GF_LOG_CRITICAL, - "namespace option not specified, Exiting"); - return -1; - } - /* Search namespace in the child node, if found, exit */ - trav = this->children; - while (trav) { - if (strcmp (trav->xlator->name, data->data) == 0) - break; - trav = trav->next; - } - if (trav) { - gf_log (this->name, GF_LOG_CRITICAL, - "namespace node used as a subvolume, Exiting"); - return -1; - } - - /* Search for the namespace node, if found, continue */ - ns_xl = this->next; - while (ns_xl) { - if (strcmp (ns_xl->name, data->data) == 0) - break; - ns_xl = ns_xl->next; - } - if (!ns_xl) { - gf_log (this->name, GF_LOG_CRITICAL, - "namespace node not found in volfile, Exiting"); - return -1; - } - - gf_log (this->name, GF_LOG_DEBUG, - "namespace node specified as %s", data->data); - - _private = CALLOC (1, sizeof (*_private)); - ERR_ABORT (_private); - _private->sched_ops = get_scheduler (this, scheduler->data); - if (!_private->sched_ops) { - gf_log (this->name, GF_LOG_CRITICAL, - "Error while loading scheduler. Exiting"); - FREE (_private); - return -1; - } - - if (ns_xl->parents) { - gf_log (this->name, GF_LOG_CRITICAL, - "Namespace node should not be a child of any other node. Exiting"); - FREE (_private); - return -1; - } - - _private->namespace = ns_xl; - - /* update _private structure */ - { - count = 0; - trav = this->children; - /* Get the number of child count */ - while (trav) { - count++; - trav = trav->next; - } - - gf_log (this->name, GF_LOG_DEBUG, - "Child node count is %d", count); - - _private->child_count = count; - if (count == 1) { - /* TODO: Should I error out here? */ - gf_log (this->name, GF_LOG_CRITICAL, - "WARNING: You have defined only one " - "\"subvolumes\" for unify volume. It may not " - "be the desired config, review your volume " - "volfile. If this is how you are testing it," - " you may hit some performance penalty"); - } - - _private->xl_array = CALLOC (1, - sizeof (xlator_t) * (count + 1)); - ERR_ABORT (_private->xl_array); - - count = 0; - trav = this->children; - while (trav) { - _private->xl_array[count++] = trav->xlator; - trav = trav->next; - } - _private->xl_array[count] = _private->namespace; - - /* self-heal part, start with generation '1' */ - _private->inode_generation = 1; - /* Because, Foreground part is tested well */ - _private->self_heal = ZR_UNIFY_FG_SELF_HEAL; - data = dict_get (this->options, "self-heal"); - if (data) { - if (strcasecmp (data->data, "off") == 0) - _private->self_heal = ZR_UNIFY_SELF_HEAL_OFF; - - if (strcasecmp (data->data, "foreground") == 0) - _private->self_heal = ZR_UNIFY_FG_SELF_HEAL; - - if (strcasecmp (data->data, "background") == 0) - _private->self_heal = ZR_UNIFY_BG_SELF_HEAL; - } - - /* optimist - ask bulde for more about it */ - data = dict_get (this->options, "optimist"); - if (data) { - if (gf_string2boolean (data->data, - &_private->optimist) == -1) { - gf_log (this->name, GF_LOG_ERROR, - "optimist excepts only boolean " - "options"); - } - } - - LOCK_INIT (&_private->lock); - } - - /* Now that everything is fine. */ - this->private = (void *)_private; - { - /* Initialize scheduler, if everything else is successful */ - ret = _private->sched_ops->init (this); - if (ret == -1) { - gf_log (this->name, GF_LOG_CRITICAL, - "Initializing scheduler failed, Exiting"); - FREE (_private); - return -1; - } - - ret = 0; - - /* This section is required because some fops may look - * for 'xl->parent' variable - */ - xlparent = CALLOC (1, sizeof (*xlparent)); - xlparent->xlator = this; - if (!ns_xl->parents) { - ns_xl->parents = xlparent; - } else { - parent = ns_xl->parents; - while (parent->next) - parent = parent->next; - parent->next = xlparent; - } - /* Initialize the namespace volume */ - if (!ns_xl->ready) { - ret = xlator_tree_init (ns_xl); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "initializing namespace node failed, " - "Exiting"); - FREE (_private); - return -1; - } - } - } - - /* Tell namespace node that init is done */ - xlator_notify (ns_xl, GF_EVENT_PARENT_UP, this); - - return 0; -} - -/** - * fini - Free all the allocated memory - */ -void -fini (xlator_t *this) -{ - unify_private_t *priv = this->private; - priv->sched_ops->fini (this); - this->private = NULL; - LOCK_DESTROY (&priv->lock); - FREE (priv->xl_array); - FREE (priv); - return; -} - - -struct xlator_fops fops = { - .stat = unify_stat, - .chmod = unify_chmod, - .readlink = unify_readlink, - .mknod = unify_mknod, - .mkdir = unify_mkdir, - .unlink = unify_unlink, - .rmdir = unify_rmdir, - .symlink = unify_symlink, - .rename = unify_rename, - .link = unify_link, - .chown = unify_chown, - .truncate = unify_truncate, - .create = unify_create, - .open = unify_open, - .readv = unify_readv, - .writev = unify_writev, - .statfs = unify_statfs, - .flush = unify_flush, - .fsync = unify_fsync, - .setxattr = unify_setxattr, - .getxattr = unify_getxattr, - .removexattr = unify_removexattr, - .opendir = unify_opendir, - .readdir = unify_readdir, - .fsyncdir = unify_fsyncdir, - .access = unify_access, - .ftruncate = unify_ftruncate, - .fstat = unify_fstat, - .lk = unify_lk, - .fchown = unify_fchown, - .fchmod = unify_fchmod, - .utimens = unify_utimens, - .lookup = unify_lookup, - .getdents = unify_getdents, - .checksum = unify_checksum, - .inodelk = unify_inodelk, - .finodelk = unify_finodelk, - .entrylk = unify_entrylk, - .fentrylk = unify_fentrylk, - .xattrop = unify_xattrop, - .fxattrop = unify_fxattrop -}; - -struct xlator_mops mops = { -}; - -struct xlator_cbks cbks = { - .forget = unify_forget, -}; - -struct volume_options options[] = { - { .key = { "namespace" }, - .type = GF_OPTION_TYPE_XLATOR - }, - { .key = { "scheduler" }, - .value = { "alu", "rr", "random", "nufa", "switch" }, - .type = GF_OPTION_TYPE_STR - }, - { .key = {"self-heal"}, - .value = { "foreground", "background", "off" }, - .type = GF_OPTION_TYPE_STR - }, - /* TODO: remove it some time later */ - { .key = {"optimist"}, - .type = GF_OPTION_TYPE_BOOL - }, - - { .key = {NULL} }, -}; diff --git a/xlators/cluster/unify/src/unify.h b/xlators/cluster/unify/src/unify.h deleted file mode 100644 index 965436db5..000000000 --- a/xlators/cluster/unify/src/unify.h +++ /dev/null @@ -1,132 +0,0 @@ -/* - Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#ifndef _UNIFY_H -#define _UNIFY_H - -#include "scheduler.h" -#include "list.h" - -#define MAX_DIR_ENTRY_STRING (32 * 1024) - -#define ZR_UNIFY_SELF_HEAL_OFF 0 -#define ZR_UNIFY_FG_SELF_HEAL 1 -#define ZR_UNIFY_BG_SELF_HEAL 2 - -/* Sometimes one should use completely random numbers.. its good :p */ -#define UNIFY_SELF_HEAL_GETDENTS_COUNT 512 - -#define NS(xl) (((unify_private_t *)xl->private)->namespace) - -/* This is used to allocate memory for local structure */ -#define INIT_LOCAL(fr, loc) \ -do { \ - loc = CALLOC (1, sizeof (unify_local_t)); \ - ERR_ABORT (loc); \ - if (!loc) { \ - STACK_UNWIND (fr, -1, ENOMEM); \ - return 0; \ - } \ - fr->local = loc; \ - loc->op_ret = -1; \ - loc->op_errno = ENOENT; \ -} while (0) - - - -struct unify_private { - /* Update this structure depending on requirement */ - void *scheduler; /* THIS SHOULD BE THE FIRST VARIABLE, - if xlator is using scheduler */ - struct sched_ops *sched_ops; /* Scheduler options */ - xlator_t *namespace; /* ptr to namespace xlator */ - xlator_t **xl_array; - gf_boolean_t optimist; - int16_t child_count; - int16_t num_child_up; - uint8_t self_heal; - uint8_t is_up; - uint64_t inode_generation; - gf_lock_t lock; -}; -typedef struct unify_private unify_private_t; - -struct unify_self_heal_struct { - uint8_t dir_checksum[ZR_FILENAME_MAX]; - uint8_t ns_dir_checksum[ZR_FILENAME_MAX]; - uint8_t file_checksum[ZR_FILENAME_MAX]; - uint8_t ns_file_checksum[ZR_FILENAME_MAX]; - off_t *offset_list; - int *count_list; - dir_entry_t **entry_list; -}; - - -struct _unify_local_t { - int32_t call_count; - int32_t op_ret; - int32_t op_errno; - mode_t mode; - off_t offset; - dev_t dev; - uid_t uid; - gid_t gid; - int32_t flags; - int32_t entry_count; - int32_t count; // dir_entry_t count; - fd_t *fd; - struct stat stbuf; - struct statvfs statvfs_buf; - struct timespec tv[2]; - char *name; - int32_t revalidate; - - ino_t st_ino; - nlink_t st_nlink; - - dict_t *dict; - - int16_t *list; - int16_t *new_list; /* Used only in case of rename */ - int16_t index; - - int32_t failed; - int32_t return_eio; /* Used in case of different st-mode - present for a given path */ - - uint64_t inode_generation; /* used to store the per directory - * inode_generation. Got from inode's ctx - * of directory inodes - */ - - struct unify_self_heal_struct *sh_struct; - loc_t loc1, loc2; -}; -typedef struct _unify_local_t unify_local_t; - -int32_t zr_unify_self_heal (call_frame_t *frame, - xlator_t *this, - unify_local_t *local); - -#endif /* _UNIFY_H */ |
