diff options
author | Krutika Dhananjay <kdhananj@redhat.com> | 2014-11-21 11:47:23 +0530 |
---|---|---|
committer | Vijay Bellur <vbellur@redhat.com> | 2015-03-19 10:59:31 -0700 |
commit | 6f389fbb812b384bdd9bf4a20e86930505531996 (patch) | |
tree | 7304eeab269d1a0226274cf439fb4f6c49bfcec9 | |
parent | 32ed7aa5ad7049a9d85c795f997336c0366151a8 (diff) |
features/shard: Introducing sharding translator
Based on the high-level design by Anand V. Avati which can be found @
https://gist.github.com/avati/af04f1030dcf52e16535#sharding-xlator-stripe-20
Still to-do:
* complete implementation of inode write fops - [f]truncate,
zerofill, fallocate, discard
* introduce transaction mechanism in inode write fops
* complete readv
* Handle open with O_TRUNC
* Handle unlinking of all shards during unlink/rename
* Compute total ia_size and ia_blocks in lookup, readdirp, etc
* wind fsync/flush on all shards
Note: Most of the items above are related. Once we come up
with a clean way to determine the last shard/shard count for
a file/file size and the mgmt of sparse regions of the file,
implementing them becomes trivial.
Change-Id: Id871379b53a4a916e4baa2e06f197dd8c0043b0f
BUG: 1200082
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
Reviewed-on: http://review.gluster.org/9841
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
-rw-r--r-- | configure.ac | 2 | ||||
-rw-r--r-- | xlators/features/Makefile.am | 2 | ||||
-rw-r--r-- | xlators/features/shard/Makefile.am | 3 | ||||
-rw-r--r-- | xlators/features/shard/src/Makefile.am | 16 | ||||
-rw-r--r-- | xlators/features/shard/src/shard-mem-types.h | 22 | ||||
-rw-r--r-- | xlators/features/shard/src/shard.c | 1791 | ||||
-rw-r--r-- | xlators/features/shard/src/shard.h | 118 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volgen.c | 13 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volume-set.c | 11 |
9 files changed, 1976 insertions, 2 deletions
diff --git a/configure.ac b/configure.ac index 16d32db7958..81faaf46549 100644 --- a/configure.ac +++ b/configure.ac @@ -160,6 +160,8 @@ AC_CONFIG_FILES([Makefile xlators/features/snapview-client/src/Makefile xlators/features/upcall/Makefile xlators/features/upcall/src/Makefile + xlators/features/shard/Makefile + xlators/features/shard/src/Makefile xlators/playground/Makefile xlators/playground/template/Makefile xlators/playground/template/src/Makefile diff --git a/xlators/features/Makefile.am b/xlators/features/Makefile.am index 5f35b37bc8e..8dc7051a303 100644 --- a/xlators/features/Makefile.am +++ b/xlators/features/Makefile.am @@ -1,5 +1,5 @@ SUBDIRS = locks quota read-only mac-compat quiesce marker index barrier arbiter\ protect compress changelog changetimerecorder ganesha gfid-access $(GLUPY_SUBDIR) qemu-block \ - upcall snapview-client snapview-server trash #path-converter # filter + upcall snapview-client snapview-server trash shard #path-converter # filter CLEANFILES = diff --git a/xlators/features/shard/Makefile.am b/xlators/features/shard/Makefile.am new file mode 100644 index 00000000000..a985f42a877 --- /dev/null +++ b/xlators/features/shard/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/features/shard/src/Makefile.am b/xlators/features/shard/src/Makefile.am new file mode 100644 index 00000000000..b589cafe924 --- /dev/null +++ b/xlators/features/shard/src/Makefile.am @@ -0,0 +1,16 @@ +xlator_LTLIBRARIES = shard.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +shard_la_LDFLAGS = -module -avoid-version + +shard_la_SOURCES = shard.c + +shard_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = shard.h shard-mem-types.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/shard/src/shard-mem-types.h b/xlators/features/shard/src/shard-mem-types.h new file mode 100644 index 00000000000..4bdcbba787c --- /dev/null +++ b/xlators/features/shard/src/shard-mem-types.h @@ -0,0 +1,22 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef __SHARD_MEM_TYPES_H__ +#define __SHARD_MEM_TYPES_H__ + +#include "mem-types.h" + +enum gf_shard_mem_types_ { + gf_shard_mt_priv_t = gf_common_mt_end + 1, + gf_shard_mt_inode_list, + gf_shard_mt_inode_ctx_t, + gf_shard_mt_iovec, + gf_shard_mt_end +}; +#endif diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c new file mode 100644 index 00000000000..fd003776f4e --- /dev/null +++ b/xlators/features/shard/src/shard.c @@ -0,0 +1,1791 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <unistd.h> + +#include "shard.h" +#include "shard-mem-types.h" +#include "byte-order.h" +#include "defaults.h" + +static gf_boolean_t +__is_shard_dir (uuid_t gfid) +{ + shard_priv_t *priv = THIS->private; + + if (uuid_compare (gfid, priv->dot_shard_gfid) == 0) + return _gf_true; + + return _gf_false; +} + +void +shard_make_block_bname (int block_num, uuid_t gfid, char *buf, size_t len) +{ + char gfid_str[GF_UUID_BUF_SIZE] = {0,}; + + uuid_unparse (gfid, gfid_str); + snprintf (buf, len, "%s.%d", gfid_str, block_num); +} + +void +shard_make_block_abspath (int block_num, uuid_t gfid, char *filepath, + size_t len) +{ + char gfid_str[GF_UUID_BUF_SIZE] = {0,}; + + uuid_unparse (gfid, gfid_str); + snprintf (filepath, len, "/%s/%s.%d", GF_SHARD_DIR, gfid_str, + block_num); +} + +int +__shard_inode_ctx_get (inode_t *inode, xlator_t *this, shard_inode_ctx_t **ctx) +{ + int ret = -1; + uint64_t ctx_uint = 0; + shard_inode_ctx_t *ctx_p = NULL; + + ret = __inode_ctx_get (inode, this, &ctx_uint); + if (ret == 0) { + *ctx = (shard_inode_ctx_t *) ctx_uint; + return ret; + } + + ctx_p = GF_CALLOC (1, sizeof (*ctx_p), gf_shard_mt_inode_ctx_t); + if (!ctx_p) + return ret; + + ret = __inode_ctx_set (inode, this, (uint64_t *)&ctx_p); + if (ret < 0) { + GF_FREE (ctx_p); + return ret; + } + + *ctx = ctx_p; + + return ret; +} + + +int +__shard_inode_ctx_set (inode_t *inode, xlator_t *this, + shard_inode_ctx_t *ctx_in) +{ + int ret = -1; + shard_inode_ctx_t *ctx = NULL; + + ret = __shard_inode_ctx_get (inode, this, &ctx); + if (ret) + return ret; + + ctx->block_size = ctx_in->block_size; + ctx->mode = ctx_in->mode; + ctx->rdev = ctx_in->rdev; + + return 0; +} + +int +shard_inode_ctx_set_all (inode_t *inode, xlator_t *this, + shard_inode_ctx_t *ctx_in) +{ + int ret = -1; + + LOCK (&inode->lock); + { + ret = __shard_inode_ctx_set (inode, this, ctx_in); + } + UNLOCK (&inode->lock); + + return ret; + +} + +int +__shard_inode_ctx_get_block_size (inode_t *inode, xlator_t *this, + uint64_t *block_size) +{ + int ret = -1; + uint64_t ctx_uint = 0; + shard_inode_ctx_t *ctx = NULL; + + ret = __inode_ctx_get (inode, this, &ctx_uint); + if (ret < 0) + return ret; + + ctx = (shard_inode_ctx_t *) ctx_uint; + + *block_size = ctx->block_size; + + return 0; +} + +int +shard_inode_ctx_get_block_size (inode_t *inode, xlator_t *this, + uint64_t *block_size) +{ + int ret = -1; + + LOCK (&inode->lock); + { + ret = __shard_inode_ctx_get_block_size (inode, this, + block_size); + } + UNLOCK (&inode->lock); + + return ret; +} + +int +__shard_inode_ctx_get_all (inode_t *inode, xlator_t *this, + shard_inode_ctx_t *ctx_out) +{ + int ret = -1; + uint64_t ctx_uint = 0; + shard_inode_ctx_t *ctx = NULL; + + ret = __inode_ctx_get (inode, this, &ctx_uint); + if (ret < 0) + return ret; + + ctx = (shard_inode_ctx_t *) ctx_uint; + + ctx_out->block_size = ctx->block_size; + ctx_out->mode = ctx->mode; + ctx_out->rdev = ctx->rdev; + + return 0; +} + +int +shard_inode_ctx_get_all (inode_t *inode, xlator_t *this, + shard_inode_ctx_t *ctx_out) +{ + int ret = -1; + + LOCK (&inode->lock); + { + ret = __shard_inode_ctx_get_all (inode, this, ctx_out); + } + UNLOCK (&inode->lock); + + return ret; +} + +void +shard_local_wipe (shard_local_t *local) +{ + int i = 0; + int count = 0; + + count = local->num_blocks; + + loc_wipe (&local->loc); + loc_wipe (&local->dot_shard_loc); + + if (local->fd) + fd_unref (local->fd); + + if (local->xattr_req) + dict_unref (local->xattr_req); + + for (i = 0; i < count; i++) { + if (local->inode_list[i]) + inode_unref (local->inode_list[i]); + } + + GF_FREE (local->inode_list); + + GF_FREE (local->vector); + if (local->iobref) + iobref_unref (local->iobref); +} + +int +shard_call_count_return (call_frame_t *frame) +{ + int call_count = 0; + shard_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + call_count = --local->call_count; + } + UNLOCK (&frame->lock); + + return call_count; +} + +int +shard_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent) +{ + int ret = 0; + uint64_t size = 0; + shard_inode_ctx_t ctx_tmp = {0,}; + + if (op_ret < 0) + goto unwind; + + if ((op_ret == 0) && (!IA_ISDIR (buf->ia_type))) { + ret = dict_get_uint64 (xdata, GF_XATTR_SHARD_BLOCK_SIZE, &size); + if (!ret && size) { + ctx_tmp.block_size = ntoh64 (size); + ctx_tmp.mode = st_mode_from_ia (buf->ia_prot, + buf->ia_type); + ctx_tmp.rdev = buf->ia_rdev; + ret = shard_inode_ctx_set_all (inode, this, &ctx_tmp); + if (ret) + gf_log (this->name, GF_LOG_WARNING, "Failed to " + "set inode ctx for %s", + uuid_utoa (inode->gfid)); + } + } + + /* To-Do: return the call with aggregated values of ia_size and + * ia_blocks + */ + +unwind: + SHARD_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf, + xdata, postparent); + return 0; +} + +int +shard_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xattr_req) +{ + int ret = -1; + int32_t op_errno = ENOMEM; + uint64_t block_size = 0; + shard_local_t *local = NULL; + + SHARD_ENTRY_FOP_CHECK (loc, op_errno, err); + + local = mem_get0 (this->local_pool); + if (!local) + goto err; + + frame->local = local; + + loc_copy (&local->loc, loc); + + local->xattr_req = xattr_req ? dict_ref (xattr_req) : dict_new (); + if (!local->xattr_req) + goto err; + + if ((shard_inode_ctx_get_block_size (loc->inode, this, &block_size) || + !block_size)) { + ret = dict_set_uint64 (local->xattr_req, + GF_XATTR_SHARD_BLOCK_SIZE, 0); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "Failed to set dict" + " value: key:%s for path %s", + GF_XATTR_SHARD_BLOCK_SIZE, loc->path); + goto err; + } + } + + STACK_WIND (frame, shard_lookup_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->lookup, loc, local->xattr_req); + + return 0; + + +err: + SHARD_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, + NULL, NULL); + return 0; + +} + +int +shard_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) +{ + /* To-Do: Update ia_size and ia_blocks in @buf before presenting it + * to the parent. + */ + + SHARD_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata); + return 0; +} + +int +shard_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + STACK_WIND (frame, shard_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + + return 0; +} + +int +shard_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) +{ + /* To-Do: Update ia_size and ia_blocks in @buf before presenting it + * to the parent. + */ + SHARD_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata); + return 0; +} + +int +shard_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + STACK_WIND (frame, shard_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + return 0; +} + +int +shard_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + /*TBD - once the ability to determine the size/number of shards for a + * file in place, this can be filled. + */ + SHARD_STACK_UNWIND (truncate, frame, -1, ENOTCONN, NULL, NULL, NULL); + return 0; +} + +int +shard_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + /*TBD - once the ability to determine the size/number of shards for a + * file in place, this can be filled. + */ + SHARD_STACK_UNWIND (ftruncate, frame, -1, ENOTCONN, NULL, NULL, NULL); + return 0; +} + +int +shard_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int ret = -1; + shard_local_t *local = NULL; + shard_inode_ctx_t ctx_tmp = {0,}; + + local = frame->local; + + if (op_ret == -1) + goto unwind; + + ctx_tmp.block_size = ntoh64 (local->block_size); + ctx_tmp.mode = st_mode_from_ia (buf->ia_prot, buf->ia_type); + ctx_tmp.rdev = buf->ia_rdev; + ret = shard_inode_ctx_set_all (inode, this, &ctx_tmp); + if (ret) + gf_log (this->name, GF_LOG_WARNING, "Failed to set inode ctx " + "for %s", uuid_utoa (inode->gfid)); + +unwind: + SHARD_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); + + return 0; +} + +int +shard_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) +{ + int ret = -1; + int32_t op_errno = ENOMEM; + shard_local_t *local = NULL; + shard_priv_t *priv = NULL; + + priv = this->private; + + local = mem_get0 (this->local_pool); + if (!local) + goto err; + + frame->local = local; + + local->block_size = hton64 (priv->block_size); + + ret = dict_set_static_bin (xdata, GF_XATTR_SHARD_BLOCK_SIZE, + &local->block_size, sizeof (uint64_t)); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "Failed to set key: %s " + "on path %s", GF_XATTR_SHARD_BLOCK_SIZE, loc->path); + goto err; + } + + STACK_WIND (frame, shard_mknod_cbk, FIRST_CHILD (this), + FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, + xdata); + return 0; + + +err: + SHARD_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL); + return 0; + +} + +int +shard_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + /* To-Do: Unlink all the shards associated with the file */ + + SHARD_STACK_UNWIND (unlink, frame, op_ret, op_errno, preparent, + postparent, xdata); + + return 0; +} + +int +shard_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) +{ + int op_errno = ENOMEM; + shard_local_t *local = NULL; + + local = mem_get0 (this->local_pool); + if (!local) + goto err; + + frame->local = local; + + loc_copy (&local->loc, loc); + local->xflag = xflag; + + if (xdata) + local->xattr_req = dict_ref (xdata); + + STACK_WIND (frame, shard_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata); + + return 0; +err: + SHARD_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); + return 0; + +} + + +int +shard_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) +{ + /* To-Do: When both src and dst names exist, and src is renamed to + * dst, all the shards associated with the dst file must be unlinked. + */ + SHARD_STACK_UNWIND (rename, frame, op_ret, op_errno, buf, + preoldparent, postoldparent, prenewparent, + postnewparent, xdata); + return 0; +} + +int +shard_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + int32_t op_errno = ENOMEM; + shard_local_t *local = NULL; + + local = mem_get0 (this->local_pool); + if (!local) + goto err; + + frame->local = local; + + STACK_WIND (frame, shard_rename_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); + return 0; + +err: + SHARD_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL, NULL); + return 0; + +} + + +int +shard_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int ret = -1; + shard_local_t *local = NULL; + shard_inode_ctx_t ctx_tmp = {0,}; + + local = frame->local; + + if (op_ret == -1) + goto unwind; + + ctx_tmp.block_size = ntoh64 (local->block_size); + ctx_tmp.mode = st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type); + ctx_tmp.rdev = stbuf->ia_rdev; + ret = shard_inode_ctx_set_all (inode, this, &ctx_tmp); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "Failed to set block size " + "for %s in inode ctx", uuid_utoa (inode->gfid)); + goto unwind; + } + +unwind: + SHARD_STACK_UNWIND (create, frame, op_ret, op_errno, fd, inode, stbuf, + preparent, postparent, xdata); + return 0; +} + +int +shard_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + int ret = -1; + int32_t op_errno = ENOMEM; + shard_local_t *local = NULL; + shard_priv_t *priv = NULL; + + priv = this->private; + + local = mem_get0 (this->local_pool); + if (!local) + goto err; + + frame->local = local; + + local->block_size = hton64 (priv->block_size); + + ret = dict_set_static_bin (xdata, GF_XATTR_SHARD_BLOCK_SIZE, + &local->block_size, sizeof (uint64_t)); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "Failed to set key: %s " + "on path %s", GF_XATTR_SHARD_BLOCK_SIZE, loc->path); + goto err; + } + + STACK_WIND (frame, shard_create_cbk, FIRST_CHILD (this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, + fd, xdata); + return 0; + +err: + SHARD_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL, NULL); + return 0; + +} + +int +shard_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +{ + /* To-Do: Handle open with O_TRUNC under locks */ + SHARD_STACK_UNWIND (open, frame, op_ret, op_errno, fd, xdata); + return 0; +} + +int +shard_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) +{ + STACK_WIND (frame, shard_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + return 0; +} + +/* - Incomplete - TBD */ +int +shard_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ +/* + int i = 0; + int32_t op_errno = ENOMEM; + uint64_t block_size = 0; + int highest_block = 0; + int num_blocks = 0; + int cur_block = 0; + char shard_abspath[PATH_MAX] = {0,}; + off_t cur_offset = 0; + size_t total_size = 0; + fd_t *cur_fd = NULL; + inode_t *inode = NULL; + shard_local_t *local = NULL; + + ret = shard_inode_ctx_get_block_size (fd->inode, this, &block_size); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get inode ctx for " + "%s", uuid_utoa(fd->inode->gfid)); + goto out; + } + + if (!block_size) { + STACK_WIND (frame, default_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, + flags, xdata); + return 0; + } + + local = mem_get0 (this->local_pool); + if (!local) + goto err; + + frame->local = local; + + local->block_size = block_size; + local->offset = offset; + local->len = size; + local->first_block = get_lowest_block (offset, block_size); + highest_block = get_highest_block (offset, size, block_size); + num_blocks = local->num_blocks = highest_block - local->first_block + 1; + + while (num_blocks--) { + cur_fd = (local->first_block == 0) ? fd_ref (fd) : + fd_anonymous (inode); + cur_offset = (cur_block == local->first_block) ? + get_shard_offset(offset, block_size):0; + STACK_WIND_COOKIE (frame, shard_readv_cbk, cur_fd, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, cur_fd, + cur_size, cur_offset, flags, xdata); + } + + return 0; + +err: + SHARD_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL, + NULL); +*/ + SHARD_STACK_UNWIND (readv, frame, -1, ENOTCONN, NULL, 0, NULL, NULL, + NULL); + return 0; + +} + +int +shard_writev_do_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int call_count = 0; + fd_t *anon_fd = cookie; + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } + + if (anon_fd) + fd_unref (anon_fd); + + call_count = shard_call_count_return (frame); + if (call_count == 0) + SHARD_STACK_UNWIND (writev, frame, local->op_ret, + local->op_errno, prebuf, postbuf, xdata); + + return 0; +} + +int +shard_writev_do (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + int count = 0; + int call_count = 0; + int last_block = 0; + uint32_t cur_block = 0; + fd_t *fd = NULL; + fd_t *anon_fd = NULL; + shard_local_t *local = NULL; + struct iovec *vec = NULL; + gf_boolean_t wind_failed = _gf_false; + off_t orig_offset = 0; + off_t shard_offset = 0; + off_t vec_offset = 0; + size_t remaining_size = 0; + size_t write_size = 0; + + local = frame->local; + fd = local->fd; + + orig_offset = local->offset; + remaining_size = local->total_size; + cur_block = local->first_block; + local->call_count = call_count = local->num_blocks; + last_block = local->last_block; + + dict_del (local->xattr_req, "gfid-req"); + + while (cur_block <= last_block) { + if (wind_failed) { + shard_writev_do_cbk (frame, (void *) (long) 0, this, -1, + ENOMEM, NULL, NULL, NULL); + goto next; + } + + shard_offset = orig_offset % local->block_size; + write_size = local->block_size - shard_offset; + if (write_size > remaining_size) + write_size = remaining_size; + + remaining_size -= write_size; + + count = iov_subset (local->vector, local->count, vec_offset, + vec_offset + write_size, NULL); + + vec = GF_CALLOC (count, sizeof (struct iovec), + gf_shard_mt_iovec); + if (!vec) { + local->op_ret = -1; + local->op_errno = ENOMEM; + wind_failed = _gf_true; + GF_FREE (vec); + shard_writev_do_cbk (frame, (void *) (long) 0, this, -1, + ENOMEM, NULL, NULL, NULL); + goto next; + } + + count = iov_subset (local->vector, local->count, vec_offset, + vec_offset + write_size, vec); + + if (cur_block == 0) { + anon_fd = fd_ref (fd); + } else { + anon_fd = fd_anonymous (local->inode_list[i]); + if (!anon_fd) { + local->op_ret = -1; + local->op_errno = ENOMEM; + wind_failed = _gf_true; + GF_FREE (vec); + shard_writev_do_cbk (frame, + (void *) (long) anon_fd, + this, -1, ENOMEM, NULL, + NULL, NULL); + goto next; + } + } + + STACK_WIND_COOKIE (frame, shard_writev_do_cbk, anon_fd, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, anon_fd, + vec, count, shard_offset, local->flags, + local->iobref, local->xattr_req); + GF_FREE (vec); + vec = NULL; + orig_offset += write_size; + vec_offset += write_size; +next: + cur_block++; + i++; + call_count--; + } + return 0; +} + +void +shard_link_block_inode (shard_local_t *local, int block_num, inode_t *inode, + struct iatt *buf) +{ + char block_bname[256] = {0,}; + inode_t *linked_inode = NULL; + shard_priv_t *priv = NULL; + + priv = THIS->private; + + shard_make_block_bname (block_num, local->fd->inode->gfid, block_bname, + sizeof (block_bname)); + + linked_inode = inode_link (inode, priv->dot_shard_inode, block_bname, + buf); + inode_lookup (linked_inode); + local->inode_list[block_num - local->first_block] = linked_inode; + /* Defer unref'ing the inodes until write is complete to prevent + * them from getting purged. These inodes are unref'd in the event of + * a failure or after successfull fop completion in shard_local_wipe(). + */ +} + +int +shard_writev_lookup_shards_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + int call_count = 0; + int shard_block_num = (long) cookie; + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + goto done; + } + + shard_link_block_inode (local, shard_block_num, inode, buf); + +done: + call_count = shard_call_count_return (frame); + if (call_count == 0) { + if (local->op_ret < 0) + goto unwind; + else + shard_writev_do (frame, this); + } + return 0; + +unwind: + SHARD_STACK_UNWIND (writev, frame, local->op_ret, local->op_errno, NULL, + NULL, NULL); + return 0; +} + +dict_t* +shard_create_gfid_dict (dict_t *dict) +{ + int ret = 0; + dict_t *new = NULL; + uuid_t *gfid = NULL; + + new = dict_copy_with_ref (dict, NULL); + if (!new) + return NULL; + + gfid = GF_CALLOC (1, sizeof (uuid_t), gf_common_mt_char); + if (!gfid) { + ret = -1; + goto out; + } + + uuid_generate (*gfid); + + ret = dict_set_dynptr (new, "gfid-req", gfid, sizeof (uuid_t)); + +out: + if (ret) { + dict_unref (new); + new = NULL; + GF_FREE (gfid); + } + + return NULL; +} + +int +shard_writev_lookup_shards (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + int ret = 0; + int call_count = 0; + int32_t shard_idx_iter = 0; + int last_block = 0; + char path[PATH_MAX] = {0,}; + char *bname = NULL; + loc_t loc = {0,}; + fd_t *fd = NULL; + shard_local_t *local = NULL; + shard_priv_t *priv = NULL; + gf_boolean_t wind_failed = _gf_false; + dict_t *xattr_req = NULL; + + priv = this->private; + local = frame->local; + fd = local->fd; + call_count = local->call_count; + shard_idx_iter = local->first_block; + last_block = local->last_block; + + while (shard_idx_iter <= last_block) { + if (local->inode_list[i]) { + i++; + shard_idx_iter++; + continue; + } + + if (wind_failed) { + shard_writev_lookup_shards_cbk (frame, + (void *) (long) shard_idx_iter, + this, -1, ENOMEM, NULL, + NULL, NULL, NULL); + goto next; + } + + shard_make_block_abspath (shard_idx_iter, fd->inode->gfid, + path, sizeof(path)); + + bname = strrchr (path, '/') + 1; + loc.inode = inode_new (priv->inode_table); + loc.parent = inode_ref (priv->dot_shard_inode); + ret = inode_path (loc.parent, bname, (char **) &(loc.path)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Inode path failed on" + " %s", bname); + local->op_ret = -1; + local->op_errno = ENOMEM; + loc_wipe (&loc); + shard_writev_lookup_shards_cbk (frame, + (void *) (long) shard_idx_iter, + this, -1, ENOMEM, NULL, + NULL, NULL, NULL); + goto next; + } + + loc.name = strrchr (loc.path, '/'); + if (loc.name) + loc.name++; + + xattr_req = shard_create_gfid_dict (local->xattr_req); + if (!xattr_req) { + local->op_ret = -1; + local->op_errno = ENOMEM; + wind_failed = _gf_true; + loc_wipe (&loc); + shard_writev_lookup_shards_cbk (frame, + (void *) (long) shard_idx_iter, + this, -1, ENOMEM, NULL, + NULL, NULL, NULL); + goto next; + } + + STACK_WIND_COOKIE (frame, shard_writev_lookup_shards_cbk, + (void *) (long) shard_idx_iter, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, &loc, + xattr_req); + loc_wipe (&loc); + dict_unref (xattr_req); +next: + shard_idx_iter++; + i++; + + if (!--call_count) + break; + } + + return 0; +} + +int +shard_writev_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int shard_block_num = (long) cookie; + int call_count = 0; + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret < 0) { + if (op_errno == EEXIST) { + local->eexist_count++; + } else { + local->op_ret = op_ret; + local->op_errno = op_errno; + } + gf_log (this->name, GF_LOG_DEBUG, "SHARD WRITEV: mknod of " + "shard %d failed: %s", shard_block_num, + strerror (op_errno)); + goto done; + } + + shard_link_block_inode (local, shard_block_num, inode, buf); + +done: + call_count = shard_call_count_return (frame); + if (call_count == 0) { + if (local->op_ret < 0) { + goto unwind; + } else { + if (!local->eexist_count) { + shard_writev_do (frame, this); + } else { + local->call_count = local->eexist_count; + shard_writev_lookup_shards (frame, this); + } + } + } + return 0; + +unwind: + SHARD_STACK_UNWIND (writev, frame, local->op_ret, local->op_errno, NULL, + NULL, NULL); + return 0; +} + +int +shard_writev_resume_mknod (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + int shard_idx_iter = 0; + int last_block = 0; + int ret = 0; + int call_count = 0; + char path[PATH_MAX] = {0,}; + char *bname = NULL; + shard_priv_t *priv = NULL; + shard_inode_ctx_t ctx_tmp = {0,}; + shard_local_t *local = NULL; + gf_boolean_t wind_failed = _gf_false; + fd_t *fd = NULL; + loc_t loc = {0,}; + dict_t *xattr_req = NULL; + + local = frame->local; + priv = this->private; + fd = local->fd; + shard_idx_iter = local->first_block; + last_block = local->last_block; + call_count = local->call_count; + + ret = shard_inode_ctx_get_all (fd->inode, this, &ctx_tmp); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get inode ctx for" + " %s", uuid_utoa (fd->inode->gfid)); + goto err; + } + + while (shard_idx_iter <= last_block) { + if (local->inode_list[i]) { + shard_idx_iter++; + i++; + continue; + } + + if (wind_failed) { + shard_writev_mknod_cbk (frame, + (void *) (long) shard_idx_iter, + this, -1, ENOMEM, NULL, NULL, + NULL, NULL, NULL); + goto next; + } + + shard_make_block_abspath (shard_idx_iter, fd->inode->gfid, + path, sizeof(path)); + + xattr_req = shard_create_gfid_dict (local->xattr_req); + if (!xattr_req) { + local->op_ret = -1; + local->op_errno = ENOMEM; + wind_failed = _gf_true; + shard_writev_mknod_cbk (frame, + (void *) (long) shard_idx_iter, + this, -1, ENOMEM, NULL, NULL, + NULL, NULL, NULL); + goto next; + } + + bname = strrchr (path, '/') + 1; + loc.inode = inode_new (priv->inode_table); + loc.parent = inode_ref (priv->dot_shard_inode); + ret = inode_path (loc.parent, bname, + (char **) &(loc.path)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Inode path failed on" + " %s", bname); + local->op_ret = -1; + local->op_errno = ENOMEM; + wind_failed = _gf_true; + loc_wipe (&loc); + dict_unref (xattr_req); + shard_writev_mknod_cbk (frame, + (void *) (long) shard_idx_iter, + this, -1, ENOMEM, NULL, NULL, + NULL, NULL, NULL); + goto next; + } + + loc.name = strrchr (loc.path, '/'); + if (loc.name) + loc.name++; + + STACK_WIND_COOKIE (frame, shard_writev_mknod_cbk, + (void *) (long) shard_idx_iter, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, &loc, + ctx_tmp.mode, ctx_tmp.rdev, 0, xattr_req); + loc_wipe (&loc); + dict_unref (xattr_req); + +next: + shard_idx_iter++; + i++; + if (!--call_count) + break; + } + + return 0; +err: + /* + * This block is for handling failure in shard_inode_ctx_get_all(). + * Failures in the while-loop are handled within the loop. + */ + SHARD_STACK_UNWIND (writev, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; +} + +int +shard_writev_create_write_shards (call_frame_t *frame, xlator_t *this) +{ + int i = -1; + uint32_t shard_idx_iter = 0; + char path[PATH_MAX] = {0,}; + fd_t *fd = NULL; + inode_t *inode = NULL; + shard_local_t *local = NULL; + shard_priv_t *priv = NULL; + + priv = this->private; + local = frame->local; + fd = local->fd; + shard_idx_iter = local->first_block; + + while (shard_idx_iter <= local->last_block) { + i++; + if (shard_idx_iter == 0) { + local->inode_list[i] = inode_ref (fd->inode); + shard_idx_iter++; + continue; + } + + shard_make_block_abspath (shard_idx_iter, fd->inode->gfid, + path, sizeof(path)); + + inode = NULL; + inode = inode_resolve (priv->inode_table, path); + if (inode) { + gf_log (this->name, GF_LOG_DEBUG, "Shard %d already " + "present. gfid=%s. Saving inode for future.", + shard_idx_iter, uuid_utoa(inode->gfid)); + shard_idx_iter++; + local->inode_list[i] = inode; + /* Let the ref on the inodes that are already present + * in inode table still be held so that they don't get + * forgotten by the time the fop reaches the actual + * write stage. + */ + continue; + } + local->call_count++; + } + + if (local->call_count) + shard_writev_resume_mknod (frame, this); + else + shard_writev_do (frame, this); + + return 0; +} + +static void +shard_link_dot_shard_inode (shard_local_t *local, inode_t *inode, + struct iatt *buf) +{ + inode_t *linked_inode = NULL; + shard_priv_t *priv = NULL; + + priv = THIS->private; + + linked_inode = inode_link (inode, local->dot_shard_loc.parent, + local->dot_shard_loc.name, buf); + inode_lookup (linked_inode); + priv->dot_shard_inode = linked_inode; +} + +int +shard_lookup_dot_shard_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret) + goto unwind; + + shard_link_dot_shard_inode (local, inode, buf); + shard_writev_create_write_shards (frame, this); + return 0; + +unwind: + SHARD_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int +shard_lookup_dot_shard (call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + STACK_WIND (frame, shard_lookup_dot_shard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, &local->dot_shard_loc, + local->xattr_req); + + return 0; +} + +int +shard_writev_mkdir_dot_shard_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret == -1) { + if (op_errno != EEXIST) { + goto unwind; + } else { + gf_log (this->name, GF_LOG_DEBUG, "mkdir on /.shard " + "failed with EEXIST. Attempting lookup now"); + shard_lookup_dot_shard (frame, this); + return 0; + } + } + + shard_link_dot_shard_inode (local, inode, buf); + shard_writev_create_write_shards (frame, this); + return 0; + +unwind: + SHARD_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int +shard_writev_mkdir_dot_shard (call_frame_t *frame, xlator_t *this) +{ + int ret = -1; + shard_local_t *local = NULL; + shard_priv_t *priv = NULL; + loc_t *dot_shard_loc = NULL; + + local = frame->local; + priv = this->private; + + dot_shard_loc = &local->dot_shard_loc; + + dot_shard_loc->inode = inode_new (priv->inode_table); + dot_shard_loc->parent = inode_ref (priv->inode_table->root); + ret = inode_path (dot_shard_loc->parent, GF_SHARD_DIR, + (char **)&dot_shard_loc->path); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Inode path failed on" + " %s", GF_SHARD_DIR); + goto err; + } + + dot_shard_loc->name = strrchr (dot_shard_loc->path, '/'); + if (dot_shard_loc->name) + dot_shard_loc->name++; + + ret = dict_set_static_bin (local->xattr_req, "gfid-req", + priv->dot_shard_gfid, 16); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set gfid-req for " + "/.shard"); + goto err; + } + + STACK_WIND (frame, shard_writev_mkdir_dot_shard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, &local->dot_shard_loc, + 0755, 0, local->xattr_req); + return 0; + +err: + SHARD_STACK_UNWIND (writev, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; +} + +/* shard_writev - still a WIP */ +int +shard_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + int i = 0; + uint64_t block_size = 0; + uint32_t first_block = 0; + uint32_t last_block = 0; + uint32_t num_blocks = 0; + size_t total_size = 0; + shard_local_t *local = NULL; + shard_priv_t *priv = NULL; + + priv = this->private; + + if (shard_inode_ctx_get_block_size (fd->inode, this, &block_size)) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get inode ctx for " + "%s", uuid_utoa(fd->inode->gfid)); + goto out; + } + + for (i = 0; i < count; i++) + total_size += vector[i].iov_len; + + first_block = get_lowest_block (offset, block_size); + last_block = get_highest_block (offset, total_size, block_size); + num_blocks = last_block - first_block + 1; + + gf_log (this->name, GF_LOG_TRACE, "gfid=%s first_block=%"PRIu32" " + "last_block=%"PRIu32" num_blocks=%"PRIu32" offset=%"PRId64" " + "total_size=%lu", uuid_utoa (fd->inode->gfid), first_block, + last_block, num_blocks, offset, total_size); + + if (!block_size || + ((first_block == 0) && (first_block == last_block))) { + /* To-Do: Replace default_writev_cbk with a specific cbk + * that would collect total size and block count before unwind + */ + STACK_WIND (frame, default_writev_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev, + fd, vector, count, offset, flags, iobref, xdata); + return 0; + } + + local = mem_get0 (this->local_pool); + if (!local) + goto out; + + frame->local = local; + + local->xattr_req = (xdata) ? dict_ref (xdata) : dict_new (); + if (!local->xattr_req) + goto out; + + local->vector = iov_dup (vector, count); + if (!local->vector) + goto out; + + local->count = count; + local->offset = offset; + local->flags = flags; + local->iobref = iobref_ref (iobref); + local->fd = fd_ref (fd); + local->first_block = first_block; + local->last_block = last_block; + local->total_size = total_size; + local->block_size = block_size; + local->num_blocks = num_blocks; + local->inode_list = GF_CALLOC (num_blocks, sizeof (inode_t *), + gf_shard_mt_inode_list); + if (!local->inode_list) + goto out; + + local->dot_shard_loc.inode = inode_find (priv->inode_table, + priv->dot_shard_gfid); + if (!local->dot_shard_loc.inode) + shard_writev_mkdir_dot_shard (frame, this); + else + shard_writev_create_write_shards (frame, this); + + return 0; +out: + SHARD_STACK_UNWIND (writev, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; +} + +int +shard_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + /* To-Do: Wind flush on all shards of the file */ + SHARD_STACK_UNWIND (flush, frame, op_ret, op_errno, xdata); + return 0; +} + +int +shard_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + STACK_WIND (frame, shard_flush_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, fd, xdata); + return 0; +} + +int +shard_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + /* To-Do: Wind fsync on all shards of the file */ + SHARD_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} + +int +shard_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) +{ + STACK_WIND (frame, shard_fsync_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); + return 0; +} + +int32_t +shard_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + fd_t *fd = NULL; + gf_dirent_t *entry = NULL; + gf_dirent_t *tmp = NULL; + shard_local_t *local = NULL; + gf_dirent_t skipped; + + INIT_LIST_HEAD (&skipped.list); + + local = frame->local; + fd = local->fd; + + if (op_ret < 0) + goto unwind; + + list_for_each_entry_safe (entry, tmp, &entries->list, list) { + if (__is_root_gfid (fd->inode->gfid) && + !(strcmp (entry->d_name, GF_SHARD_DIR))) { + list_del_init (&entry->list); + list_add_tail (&entry->list, &skipped.list); + break; + } + } + +unwind: + SHARD_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries, xdata); + gf_dirent_free (&skipped); + return 0; +} + + +int +shard_readdir_do (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, int whichop, dict_t *xdata) +{ + int op_errno = ENOMEM; + shard_local_t *local = NULL; + + local = mem_get0 (this->local_pool); + if (!local) + goto err; + + frame->local = local; + + local->fd = fd_ref (fd); + + if (whichop == GF_FOP_READDIR) + STACK_WIND (frame, shard_readdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdir, fd, size, offset, + xdata); + else + STACK_WIND (frame, shard_readdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, offset, + xdata); + + return 0; + +err: + STACK_UNWIND_STRICT (readdir, frame, -1, op_errno, NULL, NULL); + return 0; + +} + + +int32_t +shard_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) +{ + shard_readdir_do (frame, this, fd, size, offset, GF_FOP_READDIR, xdata); + return 0; +} + + +int32_t +shard_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) +{ + shard_readdir_do (frame, this, fd, size, offset, GF_FOP_READDIRP, + xdata); + return 0; +} + +int +shard_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + /* To-Do: Call fsetattr on all shards. */ + SHARD_STACK_UNWIND (setattr, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} + +int +shard_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + STACK_WIND (frame, shard_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); + + return 0; +} + +int +shard_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + /* To-Do: Call fsetattr on all shards. */ + SHARD_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} + +int +shard_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + STACK_WIND (frame, shard_fsetattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); + + return 0; +} + +int +shard_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t keep_size, off_t offset, size_t len, dict_t *xdata) +{ + /* TBD */ + SHARD_STACK_UNWIND (fallocate, frame, -1, ENOTCONN, NULL, NULL, NULL); + return 0; +} + +int +shard_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + /* TBD */ + SHARD_STACK_UNWIND (discard, frame, -1, ENOTCONN, NULL, NULL, NULL); + return 0; +} + +int +shard_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + /* TBD */ + SHARD_STACK_UNWIND (zerofill, frame, -1, ENOTCONN, NULL, NULL, NULL); + return 0; +} + +int +init (xlator_t *this) +{ + int ret = -1; + shard_priv_t *priv = NULL; + + if (!this) { + gf_log ("shard", GF_LOG_ERROR, "this is NULL. init() failed"); + goto out; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_ERROR, "Dangling volume. " + "Check volfile"); + goto out; + } + + if (!this->children || this->children->next) { + gf_log (this->name, GF_LOG_ERROR, "shard not configured with " + "exactly one sub-volume. Check volfile"); + goto out; + } + + priv = GF_CALLOC (1, sizeof (*priv), gf_shard_mt_priv_t); + if (!priv) + goto out; + + GF_OPTION_INIT ("shard-block-size", priv->block_size, size_uint64, out); + + this->local_pool = mem_pool_new (shard_local_t, 128); + if (!this->local_pool) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, "Failed to allocate locals " + "from mempool"); + goto out; + } + priv->inode_table = inode_table_new (SHARD_INODE_LRU_LIMIT, this); + if (!priv->inode_table) + goto out; + + uuid_parse (SHARD_ROOT_GFID, priv->dot_shard_gfid); + + this->private = priv; + ret = 0; +out: + if (ret) { + inode_table_destroy (priv->inode_table); + GF_FREE (priv); + mem_pool_destroy (this->local_pool); + } + + return ret; + +} + +void +fini (xlator_t *this) +{ + shard_priv_t *priv = NULL; + + GF_VALIDATE_OR_GOTO ("shard", this, out); + + mem_pool_destroy (this->local_pool); + this->local_pool = NULL; + + priv = this->private; + if (!priv) + goto out; + + this->private = NULL; + inode_table_destroy (priv->inode_table); + GF_FREE (priv); + +out: + return; +} + +int +reconfigure (xlator_t *this, dict_t *options) +{ + int ret = -1; + shard_priv_t *priv = NULL; + + priv = this->private; + + GF_OPTION_RECONF ("shard-block-size", priv->block_size, options, size, + out); + + ret = 0; + +out: + return ret; +} + +int +shard_forget (xlator_t *this, inode_t *inode) +{ + uint64_t ctx_uint = 0; + shard_inode_ctx_t *ctx = NULL; + + inode_ctx_del (inode, this, &ctx_uint); + if (!ctx_uint) + return 0; + + ctx = (shard_inode_ctx_t *)ctx_uint; + + GF_FREE (ctx); + + return 0; +} + +int +shard_release (xlator_t *this, fd_t *fd) +{ + /* TBD */ + return 0; +} + +int +shard_priv_dump (xlator_t *this) +{ + /* TBD */ + return 0; +} + +int +shard_releasedir (xlator_t *this, fd_t *fd) +{ + return 0; +} + +struct xlator_fops fops = { + .lookup = shard_lookup, + .open = shard_open, + .flush = shard_flush, + .fsync = shard_fsync, + .stat = shard_stat, + .fstat = shard_fstat, + .readv = shard_readv, + .writev = shard_writev, + .truncate = shard_truncate, + .ftruncate = shard_ftruncate, + .setattr = shard_setattr, + .fsetattr = shard_fsetattr, + .fallocate = shard_fallocate, + .discard = shard_discard, + .zerofill = shard_zerofill, + .readdir = shard_readdir, + .readdirp = shard_readdirp, + .create = shard_create, + .mknod = shard_mknod, + .unlink = shard_unlink, + .rename = shard_rename, +}; + +struct xlator_cbks cbks = { + .forget = shard_forget, + .release = shard_release, + .releasedir = shard_releasedir, +}; + +struct xlator_dumpops dumpops = { + .priv = shard_priv_dump, +}; + +struct volume_options options[] = { + { .key = {"shard-block-size"}, + .type = GF_OPTION_TYPE_SIZET, + .default_value = "4MB", + .min = SHARD_MIN_BLOCK_SIZE, + .description = "Size of the shard unit that would be read from or " + "written to the shard servers.", + }, + { .key = {NULL} }, +}; diff --git a/xlators/features/shard/src/shard.h b/xlators/features/shard/src/shard.h new file mode 100644 index 00000000000..03d881079e6 --- /dev/null +++ b/xlators/features/shard/src/shard.h @@ -0,0 +1,118 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef __SHARD_H__ +#define __SHARD_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "compat-errno.h" + +#define GF_SHARD_DIR ".shard" +#define SHARD_MIN_BLOCK_SIZE (128*GF_UNIT_KB) +#define GF_XATTR_SHARD_BLOCK_SIZE "trusted.glusterfs.shard.block-size" +#define SHARD_ROOT_GFID "be318638-e8a0-4c6d-977d-7a937aa84806" +#define SHARD_INODE_LRU_LIMIT 4096 + +#define get_lowest_block(off, shard_size) (off / shard_size) +#define get_highest_block(off, len, shard_size) ((off+len-1) / shard_size) + +#define SHARD_ENTRY_FOP_CHECK(loc, op_errno, label) do { \ + if ((loc->name && !strcmp (GF_SHARD_DIR, loc->name)) && \ + (((loc->parent) && \ + __is_root_gfid (loc->parent->gfid)) || \ + __is_root_gfid (loc->pargfid))) { \ + op_errno = EPERM; \ + goto label; \ + } \ + \ + if ((loc->parent && \ + __is_shard_dir (loc->parent->gfid)) || \ + __is_shard_dir (loc->pargfid)) { \ + op_errno = EPERM; \ + goto label; \ + } \ +} while (0) + +#define SHARD_INODE_OP_CHECK(gfid, err, label) do { \ + if (__is_shard_dir(gfid)) { \ + err = EPERM; \ + goto label; \ + } \ +} while (0) + +#define SHARD_STACK_UNWIND(fop, frame, params ...) do { \ + shard_local_t *__local = NULL; \ + if (frame) { \ + __local = frame->local; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT (fop, frame, params); \ + if (__local) { \ + shard_local_wipe (__local); \ + mem_put (__local); \ + } \ +} while (0) \ + +typedef struct shard_priv { + uint64_t block_size; + uuid_t dot_shard_gfid; + inode_table_t *inode_table; + inode_t *dot_shard_inode; +} shard_priv_t; + +typedef struct { + loc_t *loc; + short type; + char *domain; +} shard_lock_t; + +typedef struct shard_local { + int op_ret; + int op_errno; + int first_block; + int last_block; + int num_blocks; + int call_count; + int eexist_count; + int xflag; + int count; + uint32_t flags; + uint64_t block_size; + off_t offset; + size_t total_size; + uuid_t shard_gfid; + loc_t loc; + loc_t dot_shard_loc; + fd_t *fd; + dict_t *xattr_req; + inode_t **inode_list; + struct iovec *vector; + struct iobref *iobref; + struct { + int lock_count; + fop_inodelk_cbk_t inodelk_cbk; + shard_lock_t *shard_lock; + } lock; +} shard_local_t; + +typedef struct shard_inode_ctx { + uint32_t rdev; + uint64_t block_size; /* The block size with which this inode is + sharded */ + mode_t mode; +} shard_inode_ctx_t; + +#endif /* __SHARD_H__ */ diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index 6f6d1095edb..b098a297132 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -3353,6 +3353,18 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, if (ret == -1) goto out; + ret = dict_get_str_boolean (set_dict, "features.shard", _gf_false); + if (ret == -1) + goto out; + + if (ret) { + xl = volgen_graph_add (graph, "features/shard", volname); + if (!xl) { + ret = -1; + goto out; + } + } + /* As of now snapshot volume is read-only. Read-only xlator is loaded * in client graph so that AFR & DHT healing can be done in server. */ @@ -3388,7 +3400,6 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, goto out; if (ret) { xl = volgen_graph_add (graph, "encryption/crypt", volname); - if (!xl) { ret = -1; goto out; diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index ada814bb25d..68e3fb7d46b 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -1763,6 +1763,17 @@ struct volopt_map_entry glusterd_volopt_map[] = { .type = NO_DOC, .op_version = GD_OP_VERSION_3_7_0, }, + { .key = "features.shard", + .voltype = "features/shard", + .value = "off", + .op_version = GD_OP_VERSION_3_7_0, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "features.shard-block-size", + .voltype = "features/shard", + .op_version = GD_OP_VERSION_3_7_0, + .flags = OPT_FLAG_CLIENT_OPT + }, { .key = NULL } }; |